/*
 * Copyright (c) 2015, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Rose Build: functions for reducing the size of the Rose graph
 * through merging.
 */
#include "rose_build_merge.h"

#include "grey.h"
#include "rose_build.h"
#include "rose_build_impl.h"
#include "rose_build_util.h"
#include "ue2common.h"
#include "nfa/castlecompile.h"
#include "nfa/goughcompile.h"
#include "nfa/limex_limits.h"
#include "nfa/mcclellancompile.h"
#include "nfa/nfa_build_util.h"
#include "nfa/rdfa_merge.h"
#include "nfagraph/ng_holder.h"
#include "nfagraph/ng_haig.h"
#include "nfagraph/ng_is_equal.h"
#include "nfagraph/ng_lbr.h"
#include "nfagraph/ng_limex.h"
#include "nfagraph/ng_mcclellan.h"
#include "nfagraph/ng_puff.h"
#include "nfagraph/ng_redundancy.h"
#include "nfagraph/ng_repeat.h"
#include "nfagraph/ng_reports.h"
#include "nfagraph/ng_restructuring.h"
#include "nfagraph/ng_stop.h"
#include "nfagraph/ng_uncalc_components.h"
#include "nfagraph/ng_util.h"
#include "nfagraph/ng_width.h"
#include "util/bitutils.h"
#include "util/charreach.h"
#include "util/compile_context.h"
#include "util/container.h"
#include "util/dump_charclass.h"
#include "util/graph_range.h"
#include "util/order_check.h"
#include "util/report_manager.h"
#include "util/ue2string.h"

#include <algorithm>
#include <functional>
#include <list>
#include <map>
#include <queue>
#include <set>
#include <string>
#include <vector>
#include <utility>

#include <boost/functional/hash/hash_fwd.hpp>
#include <boost/range/adaptor/map.hpp>

using namespace std;
using boost::adaptors::map_values;
using boost::hash_combine;

namespace ue2 {

// NOTE(review): not referenced in this chunk of the file; presumably bounds
// the width of a "narrow" start for merge heuristics -- confirm against use.
static const size_t NARROW_START_MAX = 10;

/** \brief Vertex count limits for "small" engine merges; streaming mode
 * permits larger graphs than block mode (see small_merge_max_vertices). */
static const size_t SMALL_MERGE_MAX_VERTICES_STREAM = 128;
static const size_t SMALL_MERGE_MAX_VERTICES_BLOCK = 64;

/** \brief Mode-dependent size threshold below which a rose is considered
 * "small" (see small_rose_threshold). */
static const size_t SMALL_ROSE_THRESHOLD_STREAM = 32;
static const size_t SMALL_ROSE_THRESHOLD_BLOCK = 10;

/** \brief Max number of engines handed to a single cluster merge call;
 * larger clusters are processed in batches of this size (see mergeCluster). */
static const size_t MERGE_GROUP_SIZE_MAX = 200;

/** \brief Max number of DFAs (McClellan, Haig) to pairwise merge together. */
static const size_t DFA_CHUNK_SIZE_MAX = 200;

/** \brief Max DFA states in a merged DFA. */
static const size_t DFA_MERGE_MAX_STATES = 8000;

/** \brief An LBR must have at least this many vertices to be protected from
 * merging with other graphs. */
static const size_t LARGE_LBR_MIN_VERTICES = 32;


/** \brief Mode-dependent vertex cap for "small" merges. */
static
size_t small_merge_max_vertices(const CompileContext &cc) {
    // Streaming mode tolerates larger graphs for small merges.
    if (cc.streaming) {
        return SMALL_MERGE_MAX_VERTICES_STREAM;
    }
    return SMALL_MERGE_MAX_VERTICES_BLOCK;
}

/** \brief Mode-dependent size threshold for "small" roses. */
static
size_t small_rose_threshold(const CompileContext &cc) {
    // Block mode uses a much tighter threshold than streaming.
    if (cc.streaming) {
        return SMALL_ROSE_THRESHOLD_STREAM;
    }
    return SMALL_ROSE_THRESHOLD_BLOCK;
}

/** \brief True if \a g is an LBR that is big enough to be protected from
 * merging with other graphs (see LARGE_LBR_MIN_VERTICES). */
static
bool isLargeLBR(const NGHolder &g, const Grey &grey) {
    // The (cheap) size check runs first so that we only pay for the LBR
    // analysis on graphs that could actually qualify.
    return num_vertices(g) >= LARGE_LBR_MIN_VERTICES && isLBR(g, grey);
}

namespace {
/** \brief Key describing the properties of a reporting leaf vertex for
 * \ref mergeDupeLeaves.
 *
 * Two leaves with equal keys carry the same literals, reports, EOD accept
 * behaviour, suffix, leftfix and SOM adjustment, and are therefore
 * candidates for being merged into a single vertex. Edge properties
 * (bounds) are checked separately at merge time. */
struct DupeLeafKey {
    explicit DupeLeafKey(const RoseVertexProps &litv)
        : literals(litv.literals), reports(litv.reports),
          eod_accept(litv.eod_accept), suffix(litv.suffix), left(litv.left),
          som_adjust(litv.som_adjust) {
        DEBUG_PRINTF("eod_accept %d\n", (int)eod_accept);
        DEBUG_PRINTF("report %u\n", left.leftfix_report);
        DEBUG_PRINTF("lag %u\n", left.lag);
    }

    /** Strict weak ordering over all key fields, for use as a map key. */
    bool operator<(const DupeLeafKey &b) const {
        const DupeLeafKey &a = *this;
        ORDER_CHECK(literals);
        ORDER_CHECK(eod_accept);
        ORDER_CHECK(suffix);
        ORDER_CHECK(reports);
        ORDER_CHECK(som_adjust);
        ORDER_CHECK(left.leftfix_report);
        ORDER_CHECK(left.lag);
        return false; // keys are equal
    }

    flat_set<u32> literals;
    flat_set<ReportID> reports;
    bool eod_accept;
    suffix_id suffix;
    LeftEngInfo left;
    u32 som_adjust;
};

/** \brief Key describing a role for \ref uncalcLeaves: vertices with equal
 * keys share literals, predecessors (with identical edge properties) and
 * leftfix, so they are triggered identically and their suffixes may be
 * merged. */
struct UncalcLeafKey {
    UncalcLeafKey(const RoseGraph &g, RoseVertex v)
        : literals(g[v].literals), rose(g[v].left) {
        // Record each predecessor along with the edge properties that
        // connect it to v.
        for (const auto &e : in_edges_range(v, g)) {
            RoseVertex u = source(e, g);
            preds.insert(make_pair(u, g[e]));
        }
    }

    /** Strict weak ordering for use as a map key. */
    bool operator<(const UncalcLeafKey &b) const {
        const UncalcLeafKey &a = *this;
        ORDER_CHECK(literals);
        ORDER_CHECK(preds);
        ORDER_CHECK(rose);
        return false; // keys are equal
    }

    flat_set<u32> literals;
    flat_set<pair<RoseVertex, RoseEdgeProps>> preds;
    LeftEngInfo rose;
};
} // namespace

/**
 * This function merges leaf vertices with the same literals and report
 * id/suffix. The leaf vertices of the graph are inspected and a mapping of
 * leaf vertex properties to vertices is built. If the same set of leaf
 * properties has already been seen when we inspect a vertex, we attempt to
 * merge the vertex in with the previously seen vertex. This process can fail
 * if the vertices share a common predecessor vertex but have a differing,
 * incompatible relationship (different bounds or infix) with the predecessor.
 *
 * This takes place after \ref dedupeSuffixes to increase effectiveness as the
 * same suffix is required for a merge to occur.
 */
void mergeDupeLeaves(RoseBuildImpl &tbi) {
    map<DupeLeafKey, RoseVertex> leaves; // canonical vertex per leaf key
    vector<RoseVertex> changed; // vertices that lost edges to a canonical dupe

    RoseGraph &g = tbi.g;
    for (auto v : vertices_range(g)) {
        if (in_degree(v, g) == 0) {
            assert(tbi.isAnyStart(v));
            continue;
        }

        DEBUG_PRINTF("inspecting vertex idx=%zu in_degree %zu out_degree %zu\n",
                     g[v].idx, in_degree(v, g), out_degree(v, g));

        // Vertex must be a reporting leaf node
        if (g[v].reports.empty() || !isLeafNode(v, g)) {
            continue;
        }

        // At the moment, we ignore all successors of root or anchored_root,
        // since many parts of our runtime assume that these have in-degree 1.
        if (tbi.isRootSuccessor(v)) {
            continue;
        }

        DupeLeafKey dupe(g[v]);

        // Single map operation: insert v as the canonical vertex if this key
        // is new, otherwise fetch the previously-seen canonical vertex. This
        // avoids performing separate find/insert/find lookups for each leaf.
        auto ins = leaves.insert(make_pair(dupe, v));
        if (ins.second) {
            continue; // first time we have seen this key
        }

        RoseVertex t = ins.first->second;
        DEBUG_PRINTF("found two leaf dupe roles, idx=%zu,%zu\n", g[v].idx,
                     g[t].idx);

        // Try to move each of v's in-edges over to t. An edge can be dropped
        // outright if t already has a less constrained edge from the same
        // pred; otherwise it is rehomed onto t.
        vector<RoseEdge> deadEdges;
        for (const auto &e : in_edges_range(v, g)) {
            RoseVertex u = source(e, g);
            DEBUG_PRINTF("u idx=%zu\n", g[u].idx);
            RoseEdge et;
            bool exists;
            tie (et, exists) = edge(u, t, g);
            if (exists) {
                if (g[et].minBound <= g[e].minBound
                    && g[et].maxBound >= g[e].maxBound) {
                    DEBUG_PRINTF("remove more constrained edge\n");
                    deadEdges.push_back(e);
                }
            } else {
                DEBUG_PRINTF("rehome edge: add %zu->%zu\n", g[u].idx, g[t].idx);
                add_edge(u, t, g[e], g);
                deadEdges.push_back(e);
            }
        }

        if (!deadEdges.empty()) {
            for (const auto &e : deadEdges) {
                remove_edge(e, g);
            }
            changed.push_back(v);
            // Widen the canonical vertex's offset bounds to cover both roles.
            g[t].min_offset = min(g[t].min_offset, g[v].min_offset);
            g[t].max_offset = max(g[t].max_offset, g[v].max_offset);
        }
    }
    DEBUG_PRINTF("find loop done\n");

    // Remove any vertices that now have no in-edges.
    size_t countRemovals = 0;
    for (size_t i = 0; i < changed.size(); i++) {
        RoseVertex v = changed[i];
        if (in_degree(v, g) == 0) {
            DEBUG_PRINTF("remove vertex\n");
            if (!tbi.isVirtualVertex(v)) {
                // Unlink the vertex from the literal-info reverse mapping.
                for (u32 lit_id : g[v].literals) {
                    tbi.literal_info[lit_id].vertices.erase(v);
                }
            }
            remove_vertex(v, g);
            countRemovals++;
        }
    }

    // if we've removed anything, we need to renumber vertices
    if (countRemovals) {
        tbi.renumberVertices();
        DEBUG_PRINTF("removed %zu vertices.\n", countRemovals);
    }
}

/** Merges the suffixes on the (identical) vertices in \a vcluster, used by
 * \ref uncalcLeaves. */
static
void mergeCluster(RoseGraph &g, const ReportManager &rm,
                  const vector<RoseVertex> &vcluster,
                  vector<RoseVertex> &dead, const CompileContext &cc) {
    if (vcluster.size() <= 1) {
        return; // No merge to perform.
    }

    // Note that we batch merges up fairly crudely for performance reasons.
    vector<RoseVertex>::const_iterator it = vcluster.begin(), it2;
    while (it != vcluster.end()) {
        vector<NGHolder *> cluster;
        map<NGHolder *, RoseVertex> rev; // graph -> owning vertex

        // Gather up to MERGE_GROUP_SIZE_MAX suffix graphs for this batch.
        for (it2 = it;
             it2 != vcluster.end() && cluster.size() < MERGE_GROUP_SIZE_MAX;
             ++it2) {
            RoseVertex v = *it2;
            NGHolder *h = g[v].suffix.graph.get();
            assert(!g[v].suffix.haig); /* should not be here if haig */
            rev[h] = v;
            cluster.push_back(h);
        }
        it = it2;

        DEBUG_PRINTF("merging cluster %zu\n", cluster.size());
        map<NGHolder *, NGHolder *> merged; // victim graph -> winner graph
        mergeNfaCluster(cluster, &rm, merged, cc);
        DEBUG_PRINTF("done\n");

        // For each successful merge, fold the victim vertex's properties
        // (offset bounds, reports) into the winner and mark the victim for
        // removal by the caller.
        for (const auto &m : merged) {
            NGHolder *h_victim = m.first; // mergee
            NGHolder *h_winner = m.second;
            RoseVertex victim = rev[h_victim];
            RoseVertex winner = rev[h_winner];

            LIMIT_TO_AT_MOST(&g[winner].min_offset, g[victim].min_offset);
            ENSURE_AT_LEAST(&g[winner].max_offset, g[victim].max_offset);
            insert(&g[winner].reports, g[victim].reports);

            dead.push_back(victim);
        }
    }
}

/**
 * \brief Collects reporting leaf vertices whose graph-based suffix is not
 * shared with any other vertex, grouped into clusters of identically
 * triggered roles for \ref uncalcLeaves.
 *
 * \param[out] clusters Map from role key to the vertices in that cluster.
 * \param[out] ordered Keys in vertex iteration order, for determinism.
 */
static
void findUncalcLeavesCandidates(RoseBuildImpl &tbi,
                           map<UncalcLeafKey, vector<RoseVertex> > &clusters,
                           deque<UncalcLeafKey> &ordered) {
    const RoseGraph &g = tbi.g;

    // These are Rose graph vertices; the previous declaration used
    // NFAVertex, which only compiled because the descriptor types coincide.
    vector<RoseVertex> suffix_vertices; // vertices with suffix graphs
    ue2::unordered_map<const NGHolder *, u32> fcount; // ref count per graph

    for (auto v : vertices_range(g)) {
        if (g[v].suffix) {
            if (!g[v].suffix.graph) {
                continue; /* cannot uncalc (haig/mcclellan); TODO */
            }

            assert(g[v].suffix.graph->kind == NFA_SUFFIX);

            // Ref count all suffixes, as we don't want to merge a suffix
            // that happens to be shared with a non-leaf vertex somewhere.
            DEBUG_PRINTF("vertex %zu has suffix %p\n", g[v].idx,
                         g[v].suffix.graph.get());
            fcount[g[v].suffix.graph.get()]++;

            // Vertex must be a reporting pseudo accept
            if (!isLeafNode(v, g)) {
                continue;
            }

            suffix_vertices.push_back(v);
        }
    }

    for (auto v : suffix_vertices) {
        if (in_degree(v, g) == 0) {
            assert(tbi.isAnyStart(v));
            continue;
        }

        const NGHolder *h = g[v].suffix.graph.get();
        assert(h);
        DEBUG_PRINTF("suffix %p\n", h);

        // We can't easily merge suffixes shared with other vertices, and
        // creating a unique copy to do so may just mean we end up tracking
        // more NFAs. Better to leave shared suffixes alone.
        if (fcount[h] != 1) {
            DEBUG_PRINTF("skipping shared suffix\n");
            continue;
        }

        UncalcLeafKey key(g, v);
        vector<RoseVertex> &vec = clusters[key];
        if (vec.empty()) {
            // First member of a new cluster: record the key so callers can
            // walk clusters in a deterministic order.
            ordered.push_back(key);
        }
        vec.push_back(v);
    }

    DEBUG_PRINTF("find loop done\n");
}

/**
 * This function attempts to combine identical roles (same literals, same
 * predecessors, etc) with different suffixes into a single role which
 * activates a larger suffix. The leaf vertices of the graph with a suffix are
 * grouped into clusters which have members triggered by identical roles. The
 * \ref mergeNfaCluster function (from ng_uncalc_components) is then utilised
 * to build a set of larger (and still implementable) suffixes. The graph is
 * then updated to point to the new suffixes and any unneeded roles are
 * removed.
 *
 * Note: suffixes which are shared amongst multiple roles are not considered
 * for this pass as the individual suffixes would have to continue to exist for
 * the other roles to trigger resulting in the transformation not producing any
 * savings.
 *
 * Note: as \ref mergeNfaCluster is slow when the cluster sizes are large,
 * clusters of more than \ref MERGE_GROUP_SIZE_MAX roles are split into smaller
 * chunks for processing.
 */
void uncalcLeaves(RoseBuildImpl &tbi) {
    DEBUG_PRINTF("uncalcing\n");

    map<UncalcLeafKey, vector<RoseVertex> > clusters;
    deque<UncalcLeafKey> ordered; // keys in insertion order for determinism
    findUncalcLeavesCandidates(tbi, clusters, ordered);

    vector<RoseVertex> dead; // vertices subsumed by a merge

    for (const auto &key : ordered) {
        // Look each cluster up once, rather than re-searching the map (and
        // re-comparing the key) for every use.
        const vector<RoseVertex> &cluster = clusters.at(key);
        DEBUG_PRINTF("cluster of size %zu\n", cluster.size());
        mergeCluster(tbi.g, tbi.rm, cluster, dead, tbi.cc);
    }
    tbi.removeVertices(dead);
}

/**
 * Returns a loose hash of a leftfix for use in dedupeLeftfixes. Note that
 * reports should not contribute to the hash.
 */
static
size_t hashLeftfix(const LeftEngInfo &left) {
    size_t hash = 0;

    if (left.castle) {
        // Castle leftfix: fold in the reach and each repeat's (top, bounds)
        // pair. Reports deliberately do not contribute.
        hash_combine(hash, left.castle->reach());
        for (const auto &repeat : left.castle->repeats) {
            hash_combine(hash, repeat.first); // top
            hash_combine(hash, repeat.second.bounds);
        }
    } else if (left.graph) {
        // Graph-based leftfix: defer to the holder's own hash.
        hash_combine(hash, hash_holder(*left.graph));
    }

    return hash;
}

namespace {

/** Key used to group sets of leftfixes by the dedupeLeftfixes path. */
struct RoseGroup {
    RoseGroup(const RoseBuildImpl &build, RoseVertex v)
        : left_hash(hashLeftfix(build.g[v].left)),
          lag(build.g[v].left.lag), eod_table(build.isInETable(v)) {
        const RoseGraph &g = build.g;
        // Only single-predecessor roles are grouped (see dedupeLeftfixes).
        assert(in_degree(v, g) == 1);
        RoseVertex u = *inv_adjacent_vertices(v, g).first;
        parent = g[u].idx;
    }

    /** Strict weak ordering over (parent, hash, lag, eod_table). */
    bool operator<(const RoseGroup &b) const {
        const RoseGroup &a = *this;
        ORDER_CHECK(parent);
        ORDER_CHECK(left_hash);
        ORDER_CHECK(lag);
        ORDER_CHECK(eod_table);
        return false; // keys are equal
    }

private:
    /** Parent vertex index. We must use the index, rather than the descriptor,
     * for compile determinism. */
    size_t parent;

    /** Quick hash of the leftfix itself. Must be identical for a given pair of
     * graphs if is_equal would return true. */
    size_t left_hash;

    /** Leftfix lag value. */
    u32 lag;

    /** True if associated vertex (successor) is in the EOD table. We don't
     * allow sharing of leftfix engines between "normal" and EOD operation. */
    bool eod_table;
};

/**
 * Trivial Rose comparator intended to find graphs that are identical except
 * for their report IDs. Relies on vertex and edge indices to pick up graphs
 * that have been messily put together in different orderings...
 */
struct RoseComparator {
    explicit RoseComparator(const RoseGraph &g_in) : g(g_in) {}

    /** \brief True if the leftfixes on \a u and \a v are equivalent
     * (modulo report IDs, which are passed through to is_equal). */
    bool operator()(const RoseVertex u, const RoseVertex v) const {
        const LeftEngInfo &u_left = g[u].left;
        const LeftEngInfo &v_left = g[v].left;

        // Castle vs castle: structural comparison.
        if (u_left.castle && v_left.castle) {
            return is_equal(*u_left.castle, u_left.leftfix_report,
                            *v_left.castle, v_left.leftfix_report);
        }

        // Mixed engine types, or missing graphs, cannot be compared.
        if (!u_left.graph || !v_left.graph) {
            return false;
        }

        return is_equal(*u_left.graph, u_left.leftfix_report, *v_left.graph,
                        v_left.leftfix_report);
    }

private:
    const RoseGraph &g;
};

} // namespace

/**
 * This pass performs work similar to \ref dedupeSuffixes - it removes
 * duplicate prefix/infixes (that is, leftfixes) which are identical graphs and
 * share the same trigger vertex and lag. Leftfixes are first grouped by
 * parent role and lag to reduce the number of candidates to be inspected
 * for each leftfix. The graphs in each cluster are then compared with each
 * other and the graph is updated to only refer to a canonical version of each
 * graph.
 *
 * Note: only roles with a single predecessor vertex are considered for this
 * transform - it should probably be generalised to work for roles which share
 * the same set of predecessor roles as for \ref dedupeLeftfixesVariableLag or it
 * should be retired entirely.
 */
bool dedupeLeftfixes(RoseBuildImpl &tbi) {
    DEBUG_PRINTF("deduping leftfixes\n");
    map<RoseGroup, deque<RoseVertex>> roses; // candidates grouped by RoseGroup
    bool work_done = false;

    /* Note: a leftfix's transientness will not be altered by deduping */

    // Collect leftfixes into groups.
    RoseGraph &g = tbi.g;
    for (auto v : vertices_range(g)) {
        if (!g[v].left) {
            continue;
        }
        const left_id left(g[v].left);

        if (left.haig()) {
            /* TODO: allow merging of identical haigs */
            continue;
        }

        // Only roles with a single predecessor are considered (see fn doc).
        if (in_degree(v, g) != 1) {
            continue;
        }

        roses[RoseGroup(tbi, v)].push_back(v);
    }

    DEBUG_PRINTF("collected %zu rose groups\n", roses.size());

    const RoseComparator rosecmp(g);

    // Walk groups and dedupe the roses therein.
    for (deque<RoseVertex> &verts : roses | map_values) {
        DEBUG_PRINTF("group has %zu vertices\n", verts.size());

        ue2::unordered_set<left_id> seen;

        for (auto jt = verts.begin(), jte = verts.end(); jt != jte; ++jt) {
            RoseVertex v = *jt;
            left_id left(g[v].left);

            // Skip cases we've already handled, and mark as seen otherwise.
            if (!seen.insert(left).second) {
                continue;
            }

            // Scan the rest of the list for dupes.
            for (auto kt = next(jt); kt != jte; ++kt) {
                // Skip if the engines are already shared, or if they are not
                // structurally identical.
                if (g[v].left == g[*kt].left || !rosecmp(v, *kt)) {
                    continue;
                }

                // Dupe found.
                DEBUG_PRINTF("rose at vertex %zu is a dupe of %zu\n",
                             g[*kt].idx, g[v].idx);
                assert(g[v].left.lag == g[*kt].left.lag);
                g[*kt].left = g[v].left; // point at the canonical copy
                work_done = true;
            }
        }
    }

    return work_done;
}

/**
 * \brief Returns a numeric key that can be used to group this suffix with
 * others that may be its duplicate.
 */
static
size_t suffix_size_key(const suffix_id &s) {
    // Graph suffixes are keyed by vertex count, castles by repeat count.
    // Anything else gets a catch-all key of zero.
    size_t key = 0;
    if (s.graph()) {
        key = num_vertices(*s.graph());
    } else if (s.castle()) {
        key = s.castle()->repeats.size();
    }
    return key;
}

/** \brief Structural equality for a pair of suffixes. Only like-for-like
 * engine types (graph/graph or castle/castle) can compare equal; anything
 * else is conservatively treated as different. */
static
bool is_equal(const suffix_id &s1, const suffix_id &s2) {
    if (s1.graph() && s2.graph()) {
        return is_equal(*s1.graph(), *s2.graph());
    }
    if (s1.castle() && s2.castle()) {
        return is_equal(*s1.castle(), *s2.castle());
    }
    return false;
}

/**
 * This function simply looks for suffix NGHolder graphs which are identical
 * and updates the roles in the RoseGraph to refer to only a single copy. This
 * obviously has benefits in terms of both performance (as we don't run
 * multiple engines doing the same work) and stream state. This function first
 * groups all suffixes by number of vertices and report set to restrict the set
 * of possible candidates. Each group is then walked to find duplicates using
 * the \ref is_equal comparator for NGHolders and updating the RoseGraph as it
 * goes.
 *
 * Note: does not dedupe suffixes of vertices in the EOD table.
 */
void dedupeSuffixes(RoseBuildImpl &tbi) {
    DEBUG_PRINTF("deduping suffixes\n");

    // Map from suffix to the set of vertices that currently use it.
    ue2::unordered_map<suffix_id, set<RoseVertex>> suffix_map;
    // Candidate groups, keyed by (size key, report set) to restrict the
    // number of pairwise comparisons.
    map<pair<size_t, set<ReportID>>, vector<suffix_id>> part;

    // Collect suffixes into groups.
    RoseGraph &g = tbi.g;
    for (auto v : vertices_range(g)) {
        if (!g[v].suffix || tbi.isInETable(v)) {
            continue; // no suffix, or EOD table (not deduped; see fn doc)
        }

        const suffix_id s(g[v].suffix);

        if (!(s.graph() || s.castle())) {
            continue; // e.g. Haig
        }

        set<RoseVertex> &verts = suffix_map[s];
        if (verts.empty()) {
            // First sighting of this suffix: add it to its candidate group.
            part[make_pair(suffix_size_key(s), all_reports(s))].push_back(s);
        }
        verts.insert(v);
    }

    DEBUG_PRINTF("collected %zu groups\n", part.size());

    for (const auto &cand : part | map_values) {
        if (cand.size() <= 1) {
            continue; // Nothing to compare against.
        }
        DEBUG_PRINTF("deduping cand set of size %zu\n", cand.size());

        for (auto jt = cand.begin(); jt != cand.end(); ++jt) {
            // Single lookup per suffix; unordered_map guarantees references
            // to mapped values stay valid while we work.
            set<RoseVertex> &canon_verts = suffix_map[*jt];
            if (canon_verts.empty()) {
                continue; // Already subsumed by an earlier suffix.
            }
            for (auto kt = next(jt); kt != cand.end(); ++kt) {
                set<RoseVertex> &victim_verts = suffix_map[*kt];
                if (victim_verts.empty() || !is_equal(*jt, *kt)) {
                    continue;
                }
                DEBUG_PRINTF("found dupe\n");

                // Canonical vertex to copy the suffix from. All members of
                // canon_verts share the same suffix, so this lookup is
                // loop-invariant and is hoisted out of the rehoming loop.
                RoseVertex dupe = *canon_verts.begin();
                for (auto v : victim_verts) {
                    assert(dupe != v);
                    g[v].suffix.graph = g[dupe].suffix.graph;
                    g[v].suffix.castle = g[dupe].suffix.castle;
                    assert(suffix_id(g[v].suffix) ==
                           suffix_id(g[dupe].suffix));
                    canon_verts.insert(v);
                }
                victim_verts.clear();
            }
        }
    }
}

namespace {

/**
 * This class stores a mapping from an engine reference (left_id, suffix_id,
 * etc) to a list of vertices, and also allows us to iterate over the set of
 * engine references in insertion order -- we add to the mapping in vertex
 * iteration order, so this allows us to provide a consistent ordering.
 */
template<class EngineRef>
class Bouquet {
private:
    list<EngineRef> ordering; // Unique list in insert order.
    typedef ue2::unordered_map<EngineRef, deque<RoseVertex> > BouquetMap;
    BouquetMap bouquet; // engine -> vertices using it
public:
    /** \brief Adds vertex \a v under engine \a h, registering \a h in the
     * ordering if it has not been seen before. */
    void insert(const EngineRef &h, RoseVertex v) {
        typename BouquetMap::iterator f = bouquet.find(h);
        if (f == bouquet.end()) {
            ordering.push_back(h);
            bouquet[h].push_back(v);
        } else {
            f->second.push_back(v);
        }
    }

    /** \brief Adds all of \a verts under engine \a h. */
    void insert(const EngineRef &h, const deque<RoseVertex> &verts) {
        typename BouquetMap::iterator f = bouquet.find(h);
        if (f == bouquet.end()) {
            ordering.push_back(h);
            bouquet.insert(make_pair(h, verts));
        } else {
            f->second.insert(f->second.end(), verts.begin(), verts.end());
        }
    }

    /** \brief Vertices associated with engine \a h, which must be present. */
    const deque<RoseVertex> &vertices(const EngineRef &h) const {
        typename BouquetMap::const_iterator it = bouquet.find(h);
        assert(it != bouquet.end()); // must be present
        return it->second;
    }

    /** \brief Removes engine \a h and its vertices entirely. */
    void erase(const EngineRef &h) {
        assert(bouquet.find(h) != bouquet.end());
        bouquet.erase(h);
        ordering.remove(h); // note: O(n) walk of the ordering list
    }

    /** Remove all the elements in the given iterator range. */
    template <class Iter>
    void erase_all(Iter erase_begin, Iter erase_end) {
        for (Iter it = erase_begin; it != erase_end; ++it) {
            bouquet.erase(*it);
        }

        // Use a quick-lookup container so that we only have to traverse the
        // 'ordering' list once.
        const set<EngineRef> dead(erase_begin, erase_end);
        for (iterator it = begin(); it != end(); /* incremented inside */) {
            if (contains(dead, *it)) {
                ordering.erase(it++); // erase current element, then advance
            } else {
                ++it;
            }
        }
    }

    void clear() {
        ordering.clear();
        bouquet.clear();
    }

    /** \brief Number of distinct engines stored. */
    size_t size() const { return bouquet.size(); }

    // iterate over holders in insert order
    typedef typename list<EngineRef>::iterator iterator;
    iterator begin() { return ordering.begin(); }
    iterator end() { return ordering.end(); }

    // const iterate over holders in insert order
    typedef typename list<EngineRef>::const_iterator const_iterator;
    const_iterator begin() const { return ordering.begin(); }
    const_iterator end() const { return ordering.end(); }
};

typedef Bouquet<left_id> RoseBouquet;
typedef Bouquet<suffix_id> SuffixBouquet;

} // namespace

/**
 * Split a \ref Bouquet of some type into several smaller ones.
 */
template <class EngineRef>
static void chunkBouquets(const Bouquet<EngineRef> &in,
                          deque<Bouquet<EngineRef>> &out,
                          const size_t chunk_size) {
    if (in.size() <= chunk_size) {
        out.push_back(in);
        return;
    }

    out.push_back(Bouquet<EngineRef>());
    for (const auto &engine : in) {
        if (out.back().size() >= chunk_size) {
            out.push_back(Bouquet<EngineRef>());
        }
        out.back().insert(engine, in.vertices(engine));
    }
}

/**
 * Prefix analysis: For lit1 with delay1 and lit2 with delay2, let L be the
 * length of the largest suffix of lit1 that is a prefix of lit2. A merge is
 * bad if L - delay1 > len(lit2) - delay2.
 *
 * OR if we would have to check 2 literals of differing lags at the same
 * point.
 */
static
bool checkPrefix(const rose_literal_id &ul, const u32 ulag,
                 const rose_literal_id &vl, const u32 vlag) {
    DEBUG_PRINTF("%s %s\n", escapeString(ul.s).c_str(),
                 escapeString(vl.s).c_str());

    if (ulag != vlag) {
        // With differing lags, reject if either literal is delayed or lit1
        // has a suffix that prefixes lit2.
        if (vl.delay || ul.delay || isSuffix(ul.s, vl.s)) {
            /* rose literals should not be delayed anyway */
            return false;
        }
    }

    // Note that maxOverlap also picks up infixes.
    size_t overlap = maxOverlap(ul, vl);
    if (overlap < ulag) {
        return true; /* avoiding underflow */
    }
    return overlap - ulag <= vl.elength() - vlag;
}

/**
 * \brief True if the leftfixes on vertices \a u and \a v are safe to merge.
 *
 * Checks literal table compatibility, engine-type compatibility
 * (haig/dfa/castle), root-successor constraints and literal overlap/lag
 * interactions (via \ref checkPrefix) in both directions.
 */
bool mergeableRoseVertices(const RoseBuildImpl &tbi, RoseVertex u,
                           RoseVertex v) {
    assert(u != v);

    const auto &ulits = tbi.g[u].literals;
    const auto &vlits = tbi.g[v].literals;

    // We cannot merge roses that prefix literals in different tables.
    if (tbi.literals.right.at(*ulits.begin()).table !=
            tbi.literals.right.at(*vlits.begin()).table) {
        DEBUG_PRINTF("literals in different tables\n");
        return false;
    }

    const left_id u_left(tbi.g[u].left), v_left(tbi.g[v].left);

    // Haigs only merge with themselves (i.e. the identical graph).
    if (u_left.haig() || v_left.haig()) {
        if (u_left.graph() != v_left.graph()) {
            return false;
        }
    }

    // Likewise for DFA-implemented leftfixes.
    if (u_left.dfa() || v_left.dfa()) {
        if (u_left.graph() != v_left.graph()) {
            return false;
        }
    }

    if (u_left.castle() || v_left.castle()) {
        if (!u_left.castle() || !v_left.castle()) {
            return false; // Must both be castles.
        }
    }

    // UE-1675: in block mode, we want to be a little more selective -- only
    // merge prefix roses when the literal sets are the same.
    if (!tbi.cc.streaming && tbi.isRootSuccessor(u)) {
        assert(tbi.isRootSuccessor(v));
        if (ulits != vlits) {
            DEBUG_PRINTF("literals aren't identical (block mode prefix)\n");
            return false;
        }
    }

    /* We cannot merge prefixes/vertices if they are successors of different
     * root vertices */
    if (tbi.isRootSuccessor(u)) {
        assert(tbi.isRootSuccessor(v));
        set<RoseVertex> u_preds;
        set<RoseVertex> v_preds;
        insert(&u_preds, inv_adjacent_vertices(u, tbi.g));
        insert(&v_preds, inv_adjacent_vertices(v, tbi.g));

        if (u_preds != v_preds) {
            return false;
        }
    }

    // We accept any pair of literal sets A and B where no literal in A
    // contains a literal in B and no literal in B contains a literal in A.

    const u32 ulag = tbi.g[u].left.lag;
    const u32 vlag = tbi.g[v].left.lag;

    // Every cross-pair of literals must pass the prefix/lag check in both
    // directions.
    for (const u32 &ulit : ulits) {
        const rose_literal_id &ul = tbi.literals.right.at(ulit);

        if (ul.delay) {
            return false; // We don't handle delayed cases here.
        }

        for (const u32 &vlit : vlits) {
            const rose_literal_id &vl = tbi.literals.right.at(vlit);

            if (vl.delay) {
                return false; // We don't handle delayed cases here.
            }

            if (!checkPrefix(ul, ulag, vl, vlag) ||
                !checkPrefix(vl, vlag, ul, ulag)) {
                DEBUG_PRINTF("prefix check failed\n");
                return false;
            }
        }
    }

    DEBUG_PRINTF("roses on %zu and %zu are mergeable\n",
                 tbi.g[u].idx, tbi.g[v].idx);
    return true;
}

/** \brief Checks that lag \a vlag can be satisfied after the overlap between
 * every predecessor literal in \a ulits and every successor literal in
 * \a vlits. */
static
bool mergeableDelays(const RoseBuildImpl &tbi, const flat_set<u32> &ulits,
                     const flat_set<u32> &vlits, u32 vlag) {
    for (const u32 &u_id : ulits) {
        const rose_literal_id &u_lit = tbi.literals.right.at(u_id);
        assert(!u_lit.delay); // this should never have got this far?
        for (const u32 v_id : vlits) {
            const rose_literal_id &v_lit = tbi.literals.right.at(v_id);
            assert(!v_lit.delay); // this should never have got this far?

            DEBUG_PRINTF("%s %s (lag %u, overlap %zu)\n",
                         escapeString(u_lit.s).c_str(),
                         escapeString(v_lit.s).c_str(), vlag,
                         maxOverlap(u_lit, v_lit));

            // The portion of the successor literal remaining after the
            // overlap must be able to cover the lag.
            size_t remaining = v_lit.elength() - maxOverlap(u_lit, v_lit);
            if (vlag > remaining) {
                DEBUG_PRINTF("failed lag check!\n");
                return false;
            }
        }
    }
    return true;
}

/**
 * \brief For every predecessor of a vertex in \a v1, checks that its
 * literals can satisfy the leftfix lag of every vertex in \a v2 (via
 * \ref mergeableDelays). Common predecessors are skipped, as those pairs
 * have already survived the delay checks.
 */
static
bool checkPredDelays(const RoseBuildImpl &tbi, const deque<RoseVertex> &v1,
                     const deque<RoseVertex> &v2) {
    set<RoseVertex> preds; // all predecessors of vertices in v1
    for (auto v : v1) {
        insert(&preds, inv_adjacent_vertices(v, tbi.g));
    }

    for (auto u : preds) {
        const auto &pred_lits = tbi.g[u].literals;
        for (auto v : v2) {
            u32 vlag = tbi.g[v].left.lag;
            DEBUG_PRINTF("consider (%zu, %zu) lag=%u\n", tbi.g[u].idx,
                         tbi.g[v].idx, vlag);
            if (edge_by_target(u, v, tbi.g).second) {
                /* no need to examine delays as it is a common pred - so checks
                 * must already have survived the delay checks.
                 * This is important when the pred is in the anchored table as
                 * the literal is no longer available. */
                DEBUG_PRINTF("ok, also %zu is also a pred of %zu\n",
                             tbi.g[u].idx, tbi.g[v].idx);
                continue;
            }
            if (!mergeableDelays(tbi, pred_lits, tbi.g[v].literals, vlag)) {
                return false;
            }
        }
    }

    return true;
}

/** \brief True if every vertex in \a verts1 is pairwise mergeable with every
 * vertex in \a verts2, and predecessor delay constraints hold both ways. */
static
bool mergeableRoseVertices(const RoseBuildImpl &tbi,
                           const deque<RoseVertex> &verts1,
                           const deque<RoseVertex> &verts2) {
    // Every cross-pair must be individually mergeable.
    for (auto a : verts1) {
        for (auto b : verts2) {
            if (!mergeableRoseVertices(tbi, a, b)) {
                return false;
            }
        }
    }

    // Check preds are compatible as well, in both directions.
    if (!checkPredDelays(tbi, verts1, verts2)
        || !checkPredDelays(tbi, verts2, verts1)) {
        return false;
    }

    DEBUG_PRINTF("vertex sets are mergeable\n");
    return true;
}

bool mergeableRoseVertices(const RoseBuildImpl &tbi, const set<RoseVertex> &v1,
                           const set<RoseVertex> &v2) {
    const deque<RoseVertex> vv1(v1.begin(), v1.end());
    const deque<RoseVertex> vv2(v2.begin(), v2.end());
    return mergeableRoseVertices(tbi, vv1, vv2);
}

/** \brief Priority queue element for Rose merges. */
namespace {
struct RoseMergeCandidate {
    RoseMergeCandidate(const left_id &r1_in, const left_id &r2_in, u32 cpl_in,
                       u32 tb)
        : r1(r1_in), r2(r2_in), stopxor(0), cpl(cpl_in), states(0),
          tie_breaker(tb) {
        if (r1.graph() && r2.graph()) {
            const NGHolder &h1 = *r1.graph(), &h2 = *r2.graph();
            /* som_none as haigs don't merge and just a guiding heuristic */
            CharReach stop1 = findStopAlphabet(h1, SOM_NONE);
            CharReach stop2 = findStopAlphabet(h2, SOM_NONE);
            stopxor = (stop1 ^ stop2).count();

            // We use the number of vertices as an approximation of the state
            // count here, as this is just feeding a comparison.
            u32 vertex_count = num_vertices(h1) + num_vertices(h2);
            states = vertex_count - min(vertex_count, cpl);
        } else if (r1.castle() && r2.castle()) {
            // FIXME
        }
    }

    bool operator<(const RoseMergeCandidate &a) const {
        if (stopxor != a.stopxor) {
            return stopxor > a.stopxor;
        }
        if (cpl != a.cpl) {
            return cpl < a.cpl;
        }
        if (states != a.states) {
            return states > a.states;
        }
        return tie_breaker < a.tie_breaker;
    }

    left_id r1;
    left_id r2;
    u32 stopxor;
    u32 cpl; //!< common prefix length
    u32 states;
    u32 tie_breaker; //!< determinism
};
}

/** \brief Merge the engine in r1 into the engine in r2 and repoint r1's
 * vertices at the combined engine.
 *
 * Both engines must be of the same type (both graphs or both castles). On
 * failure, false is returned and the Rose graph is left unmodified.
 */
static
bool mergeRosePair(RoseBuildImpl &tbi, left_id &r1, left_id &r2,
                   const deque<RoseVertex> &verts1,
                   const deque<RoseVertex> &verts2) {
    assert(!verts1.empty() && !verts2.empty());

    RoseGraph &g = tbi.g;

    if (r1.graph()) {
        assert(r2.graph());
        assert(r1.graph()->kind == r2.graph()->kind);
        if (!mergeNfaPair(*r1.graph(), *r2.graph(), nullptr, tbi.cc)) {
            DEBUG_PRINTF("nfa merge failed\n");
            return false;
        }

        // The graph in r1 has been merged into the graph in r2. Update r1's
        // vertices with the new graph ptr. Since the parent vertices are the
        // same, we know that tops will already have been distinct.
        shared_ptr<NGHolder> &h = g[verts2.front()].left.graph;
        for (RoseVertex v : verts1) {
            g[v].left.graph = h;
        }

        return true;
    } else if (r1.castle()) {
        assert(r2.castle());
        assert(tbi.cc.grey.allowCastle);

        // mergeCastle folds r1's repeats into r2's castle and fills top_map
        // with the renumbering applied to r1's tops.
        map<u32, u32> top_map;
        if (!mergeCastle(*r2.castle(), *r1.castle(), top_map)) {
            DEBUG_PRINTF("castle merge failed\n");
            return false;
        }

        // The castle in r1 has been merged into the castle in r2, with tops
        // remapped as per top_map. Rewire r1's vertices and their trigger
        // edges accordingly.
        const shared_ptr<CastleProto> &c = g[verts2.front()].left.castle;
        for (RoseVertex v : verts1) {
            g[v].left.castle = c;
            for (const auto &e : in_edges_range(v, g)) {
                g[e].rose_top = top_map.at(g[e].rose_top);
            }
        }
        return true;
    }

    // Engines must be either graphs or castles at this point.
    assert(0);
    return false;
}

/** \brief Greedily perform the merges described by the priority queue,
 * updating the bouquet as engines are consumed. */
static
void processMergeQueue(RoseBuildImpl &tbi, RoseBouquet &roses,
                       priority_queue<RoseMergeCandidate> &pq) {
    // Engines that have already been merged away; any queued candidate that
    // still refers to them is stale and must be skipped.
    ue2::unordered_set<left_id> dead;

    DEBUG_PRINTF("merge queue has %zu entries\n", pq.size());

    while (!pq.empty()) {
        DEBUG_PRINTF("pq pop h1=%p, h2=%p, cpl=%u, states=%u\n",
                     pq.top().r1.graph(), pq.top().r2.graph(), pq.top().cpl,
                     pq.top().states);

        left_id r1 = pq.top().r1, r2 = pq.top().r2;
        pq.pop();

        if (contains(dead, r1) || contains(dead, r2)) {
            continue;
        }

        if (r1.graph() && r2.graph()) {
            // Heuristic: reject merges that would substantially damage the
            // stop alphabet shared by the two graphs.
            NGHolder *h1 = r1.graph(), *h2 = r2.graph();
            CharReach stop1 = findStopAlphabet(*h1, SOM_NONE);
            CharReach stop2 = findStopAlphabet(*h2, SOM_NONE);
            CharReach stopboth(stop1 & stop2);
            DEBUG_PRINTF("stop1=%zu, stop2=%zu, stopboth=%zu\n", stop1.count(),
                         stop2.count(), stopboth.count());
            if (stopboth.count() < 10 &&
                (stop1.count() > 10 || stop2.count() > 10)) {
                DEBUG_PRINTF("skip merge, would kill stop alphabet\n");
                continue;
            }
            size_t maxstop = max(stop1.count(), stop2.count());
            if (maxstop > 200 && stopboth.count() < 200) {
                DEBUG_PRINTF("skip merge, would reduce stop alphabet\n");
                continue;
            }
        }

        const deque<RoseVertex> &verts1 = roses.vertices(r1);
        const deque<RoseVertex> &verts2 = roses.vertices(r2);

        // Re-check mergeability: earlier merges may have changed the vertex
        // sets associated with each engine.
        if (!mergeableRoseVertices(tbi, verts1, verts2)) {
            continue;
        }

        if (!mergeRosePair(tbi, r1, r2, verts1, verts2)) {
            continue;
        }

        // r1 has been folded into r2: move r1's vertices over and retire r1.
        roses.insert(r2, verts1);
        roses.erase(r1);
        dead.insert(r1);
    }
}

static
bool nfaHasNarrowStart(const NGHolder &g) {
    if (hasGreaterOutDegree(1, g.startDs, g)) {
        return false; // unanchored
    }

    CharReach cr;

    for (auto v : adjacent_vertices_range(g.start, g)) {
        if (v == g.startDs) {
            continue;
        }
        cr |= g[v].char_reach;
    }
    return cr.count() <= NARROW_START_MAX;
}

/* True if the graph cannot match arbitrarily long inputs. */
static
bool nfaHasFiniteMaxWidth(const NGHolder &g) {
    const auto max_width = findMaxWidth(g);
    return max_width.is_finite();
}

namespace {
struct RoseMergeKey {
    RoseMergeKey(const RoseVertexSet &parents_in,
                 bool narrowStart_in, bool hasMaxWidth_in) :
                        narrowStart(narrowStart_in),
                        hasMaxWidth(hasMaxWidth_in),
                        parents(parents_in) {}
    bool operator<(const RoseMergeKey &b) const {
        const RoseMergeKey &a = *this;
        ORDER_CHECK(narrowStart);
        ORDER_CHECK(hasMaxWidth);
        ORDER_CHECK(parents);
        return false;
    }

    // NOTE: these two bool discriminators are only used for prefixes, not
    // infixes.
    bool narrowStart;
    bool hasMaxWidth;

    RoseVertexSet parents;
};
}

static
bool hasReformedStartDotStar(const NGHolder &h, const Grey &grey) {
    if (!proper_out_degree(h.startDs, h)) {
        return false;
    }

    assert(!is_triggered(h));

    NGHolder h_temp;
    cloneHolder(h_temp, h);

    vector<BoundedRepeatData> repeats;
    bool suitable_for_sds_reforming = false;
    const map<u32, u32> fixed_depth_tops; /* not relevant for cfa check */
    const map<u32, vector<vector<CharReach>>> triggers; /* not for cfa check */
    const bool simple_model_selection = true; // FIRST is considered simple
    analyseRepeats(h_temp, nullptr, fixed_depth_tops, triggers, &repeats, true,
                   simple_model_selection, grey, &suitable_for_sds_reforming);

    return suitable_for_sds_reforming;
}

/* Length of the common prefix between two leftfix engines: computed via the
 * state-level comparison for graph pairs, and via minimum width for castle
 * pairs. Mixed or unsupported pairings yield zero. */
static
u32 commonPrefixLength(left_id &r1, left_id &r2) {
    if (r1.graph() && r2.graph()) {
        NGHolder &h1 = *r1.graph();
        NGHolder &h2 = *r2.graph();
        auto ids1 = numberStates(h1);
        auto ids2 = numberStates(h2);
        return commonPrefixLength(h1, ids1, h2, ids2);
    }
    if (r1.castle() && r2.castle()) {
        return min(findMinWidth(*r1.castle()), findMinWidth(*r2.castle()));
    }
    return 0;
}

/**
 * This pass attempts to merge prefix/infix engines which share a common set of
 * parent vertices.
 *
 * Engines are greedily merged pairwise by this process based on a priority
 * queue keyed off the common prefix length.
 *
 * Engines are not merged if the lags are not compatible or if it would damage
 * the stop alphabet.
 *
 * Infixes:
 * - LBR candidates are not considered.
 *
 * Prefixes:
 * - transient prefixes are not considered.
 * - with a max width or a narrow start are kept segregated by
 *   this phase and can only be merged with similar infixes.
 * - in block mode, merges are only performed if literal sets are the same.
 * - merges are not considered in cases where dot star start state will be
 *   reformed to optimise a leading repeat.
 */
void mergeLeftfixesVariableLag(RoseBuildImpl &tbi) {
    if (!tbi.cc.grey.mergeRose) {
        return;
    }

    // Candidate engines grouped by (parents, narrowStart, hasMaxWidth).
    map<RoseMergeKey, RoseBouquet> rosesByParent;
    RoseGraph &g = tbi.g;
    RoseVertexSet parents(g);

    DEBUG_PRINTF("-----\n");
    DEBUG_PRINTF("entry\n");
    DEBUG_PRINTF("-----\n");

    // Phase 1: filter vertices and bucket the surviving leftfixes.
    for (auto v : vertices_range(g)) {
        if (!g[v].left) {
            continue;
        }

        const bool is_prefix = tbi.isRootSuccessor(v);

        // Only non-transient for the moment.
        if (contains(tbi.transient, g[v].left)) {
            continue;
        }

        // No forced McClellan or Haig infix merges.
        if (g[v].left.dfa || (!is_prefix && g[v].left.haig)) {
            continue;
        }

        if (g[v].left.graph) {
            NGHolder &h = *g[v].left.graph;

            /* Ensure that kind on the graph is correct */
            assert(h.kind == (is_prefix ? NFA_PREFIX : NFA_INFIX));

            if (hasReformedStartDotStar(h, tbi.cc.grey)) {
                continue; // preserve the optimisation of the leading repeat
            }

            if (!is_prefix && isLargeLBR(h, tbi.cc.grey)) {
                continue;
            }
        }

        if (g[v].left.castle && !tbi.cc.grey.allowCastle) {
            DEBUG_PRINTF("castle merging disallowed by greybox\n");
            continue;
        }

        // We collapse the anchored root into the root vertex when calculating
        // parents, so that we can merge differently-anchored prefix roses
        // together. (Prompted by UE-2100)
        parents.clear();
        for (auto u : inv_adjacent_vertices_range(v, g)) {
            if (tbi.isAnyStart(u)) {
                parents.insert(tbi.root);
            } else {
                parents.insert(u);
            }
        }

        if (parents.empty()) {
            assert(0);
            continue;
        }

        // We want to distinguish prefixes (but not infixes) on whether they
        // have a narrow start or max width.
        bool narrowStart = false, hasMaxWidth = false;
        if (is_prefix && g[v].left.graph) {
            const NGHolder &h = *g[v].left.graph;
            narrowStart = nfaHasNarrowStart(h);
            hasMaxWidth = nfaHasFiniteMaxWidth(h);
        }

        RoseMergeKey key(parents, narrowStart, hasMaxWidth);
        rosesByParent[key].insert(g[v].left, v);
    }

    // Phase 2: within each bucket, queue all mergeable pairs by priority and
    // greedily merge them.
    for (auto &m : rosesByParent) {
        if (m.second.size() < 2) {
            continue;
        }

        // Limit group sizes to keep compile-time costs bounded.
        deque<RoseBouquet> rose_groups;
        chunkBouquets(m.second, rose_groups, MERGE_GROUP_SIZE_MAX);
        m.second.clear();
        DEBUG_PRINTF("chunked roses into %zu groups\n", rose_groups.size());

        for (auto &roses : rose_groups) {
            // All pairs on the prio queue.
            u32 tie_breaker = 0;
            priority_queue<RoseMergeCandidate> pq;
            for (auto it = roses.begin(), ite = roses.end(); it != ite; ++it) {
                left_id r1 = *it;
                const deque<RoseVertex> &verts1 = roses.vertices(r1);

                for (auto jt = next(it); jt != ite; ++jt) {
                    left_id r2 = *jt;

                    // Roses must be of the same engine type to be mergeable.
                    if ((!r1.graph() != !r2.graph()) ||
                        (!r1.castle() != !r2.castle())) {
                        continue;
                    }

                    // Castles must have the same reach to be mergeable.
                    if (r1.castle()) {
                        if (r1.castle()->reach() != r2.castle()->reach()) {
                            continue;
                        }
                    }

                    const deque<RoseVertex> &verts2 = roses.vertices(r2);
                    if (!mergeableRoseVertices(tbi, verts1, verts2)) {
                        continue; // No point queueing unmergeable cases.
                    }

                    u32 cpl = commonPrefixLength(r1, r2);
                    pq.push(RoseMergeCandidate(r1, r2, cpl, tie_breaker++));
                }
            }
            processMergeQueue(tbi, roses, pq);
        }
    }

    DEBUG_PRINTF("-----\n");
    DEBUG_PRINTF("exit\n");
    DEBUG_PRINTF("-----\n");
}

namespace {

/**
 * Key used to group sets of leftfixes for the dedupeLeftfixesVariableLag path.
 */
struct DedupeLeftKey {
    DedupeLeftKey(const RoseBuildImpl &build, RoseVertex v)
        : left_hash(hashLeftfix(build.g[v].left)) {
        const auto &g = build.g;
        for (const auto &e : in_edges_range(v, g)) {
            preds.emplace(g[source(e, g)].idx, g[e].rose_top);
        }
    }

    bool operator<(const DedupeLeftKey &b) const {
        return tie(left_hash, preds) < tie(b.left_hash, b.preds);
    }

private:
    /** Quick hash of the leftfix itself. Must be identical for a given pair of
     * graphs if is_equal would return true. */
    size_t left_hash;

    /** For each in-edge, the pair of (parent index, edge top). */
    set<pair<size_t, u32>> preds;
};

} // namespace

/**
 * This is a generalisation of \ref dedupeLeftfixes which relaxes two
 * restrictions: multiple predecessor roles are allowed and the delay used by
 * each vertex may not be the same for each vertex. Like \ref dedupeLeftfixes,
 * the leftfixes' successor vertices are first grouped to reduce the number of
 * potential candidates - the grouping in this case is by the set of
 * predecessor roles with their associated top events. For the dedupe to be
 * possible, it is required that:
 *
 * 1. the nfa graphs with respect to the relevant reports are identical
 * 2. the nfa graphs are triggered by the same roles with same events (ensured
 *    by the initial grouping pass)
 * 3. all the successor roles of either graph can inspect the combined leftfix
 *    without advancing the state of the leftfix past the point that another
 *    successor may want to inspect it; the overlap relationships between the
 *    involved literals are examined to ensure that this property holds.
 *
 * Note: in block mode we restrict the dedupe of prefixes further as some of
 * logic checks are shared with the mergeLeftfix functions.
 */
void dedupeLeftfixesVariableLag(RoseBuildImpl &tbi) {
    // Candidates bucketed by (leftfix hash, predecessor/top set).
    map<DedupeLeftKey, RoseBouquet> roseGrouping;

    DEBUG_PRINTF("entry\n");

    RoseGraph &g = tbi.g;
    for (auto v : vertices_range(g)) {
        if (!g[v].left) {
            continue;
        }

        const left_id leftfix(g[v].left);

        // Only non-transient for the moment.
        if (contains(tbi.transient, leftfix)) {
            continue;
        }

        if (leftfix.haig()) {
            /* TODO: allow merging of identical haigs */
            continue;
        }

        roseGrouping[DedupeLeftKey(tbi, v)].insert(leftfix, v);
    }

    for (RoseBouquet &roses : roseGrouping | map_values) {
        DEBUG_PRINTF("group of %zu roses\n", roses.size());

        if (roses.size() < 2) {
            continue;
        }

        const RoseComparator rosecmp(g);

        for (auto it = roses.begin(); it != roses.end(); ++it) {
            left_id r1 = *it;
            const deque<RoseVertex> &verts1 = roses.vertices(r1);

            for (auto jt = next(it); jt != roses.end(); ++jt) {
                left_id r2 = *jt;
                const deque<RoseVertex> &verts2 = roses.vertices(r2);

                // Deep structural equality check on representative vertices.
                if (!rosecmp(verts1.front(), verts2.front())) {
                    continue;
                }

                if (!mergeableRoseVertices(tbi, verts1, verts2)) {
                    continue;
                }

                DEBUG_PRINTF("%p and %p are dupes\n", r1.graph(), r2.graph());

                // Replace h1 with h2.

                const LeftEngInfo &v2_left = g[verts2.front()].left;
                assert(v2_left.graph.get() == r2.graph());

                // Rewire r1's vertices to r2's engine, preserving each
                // vertex's own lag value.
                for (auto v : verts1) {
                    DEBUG_PRINTF("replacing report %u with %u on %zu\n",
                                 g[v].left.leftfix_report,
                                 v2_left.leftfix_report, g[v].idx);
                    u32 orig_lag = g[v].left.lag;
                    g[v].left = v2_left;
                    g[v].left.lag = orig_lag;
                }
                roses.insert(r2, verts1);
                // no need to erase h1 from roses, that would invalidate `it'.
                break;
            }
        }
    }
}

/* Return the smallest top value not present in the given set. Tops are small
 * dense integers, so a linear scan upwards suffices. */
static
u32 findUnusedTop(const ue2::flat_set<u32> &tops) {
    u32 t = 0;
    while (contains(tops, t)) {
        t++;
    }
    assert(t < NFA_MAX_TOP_MASKS);
    return t;
}

// Rewrite the top value on each of the start vertex's out-edges according to
// top_mapping; every top present must have an entry in the map.
static
void replaceTops(NGHolder &h, const map<u32, u32> &top_mapping) {
    for (const auto &e : out_edges_range(h.start, h)) {
        NFAVertex succ = target(e, h);
        if (succ == h.startDs) {
            continue; // no top on the start self-loop edge
        }
        const u32 old_top = h[e].top;
        DEBUG_PRINTF("vertex %u has top %u\n", h[succ].index, old_top);
        assert(contains(top_mapping, old_top));
        h[e].top = top_mapping.at(old_top);
    }
}

/* Renumber h1's tops so that they do not collide with h2's, recording the
 * renumbering in top_mapping. Returns false if the combined machine would
 * need more tops than NFA_MAX_TOP_MASKS allows. top_mapping is left empty
 * when no renumbering was needed. */
static
bool setDistinctTops(NGHolder &h1, const NGHolder &h2,
                     map<u32, u32> &top_mapping) {
    ue2::flat_set<u32> tops1 = getTops(h1);
    ue2::flat_set<u32> tops2 = getTops(h2);

    DEBUG_PRINTF("before: h1 has %zu tops, h2 has %zu tops\n", tops1.size(),
                 tops2.size());

    // The merged machine must be able to hold all tops simultaneously.
    if (tops1.size() + tops2.size() > NFA_MAX_TOP_MASKS) {
        DEBUG_PRINTF("too many tops!\n");
        return false;
    }

    // Disjoint top sets need no renumbering at all.
    if (!has_intersection(tops1, tops2)) {
        DEBUG_PRINTF("tops don't intersect\n");
        return true;
    }

    // Map every top in h1 onto a value unused by h2 (and by replacements
    // already handed out), then rewrite h1's edges.
    top_mapping.clear();
    for (u32 old_top : tops1) {
        u32 new_top = findUnusedTop(tops2);
        DEBUG_PRINTF("replacing top %u with %u in h1\n", old_top, new_top);
        top_mapping.insert(make_pair(old_top, new_top));
        assert(!contains(tops2, new_top));
        tops2.insert(new_top);
    }

    replaceTops(h1, top_mapping);
    return true;
}

/* As setDistinctTops, but also rewrites the rose_top value on the in-edges of
 * h1's user vertices to follow the new numbering. */
bool setDistinctRoseTops(RoseGraph &g, NGHolder &h1, const NGHolder &h2,
                         const deque<RoseVertex> &verts1) {
    map<u32, u32> top_mapping;
    if (!setDistinctTops(h1, h2, top_mapping)) {
        return false;
    }
    if (top_mapping.empty()) {
        return true; // No remapping necessary.
    }

    for (auto v : verts1) {
        DEBUG_PRINTF("vertex %zu\n", g[v].idx);
        assert(!g[v].left.haig);
        assert(!g[v].left.dfa);
        for (const auto &e : in_edges_range(v, g)) {
            const u32 old_top = g[e].rose_top;
            DEBUG_PRINTF("t=%u\n", old_top);
            assert(contains(top_mapping, old_top));
            g[e].rose_top = top_mapping[old_top];
            DEBUG_PRINTF("edge (%zu,%zu) went from top %u to %u\n",
                         g[source(e, g)].idx, g[target(e, g)].idx, old_top,
                         top_mapping[old_top]);
        }
    }

    return true;
}

/* As setDistinctTops, but for suffixes: rewrites the suffix.top value on each
 * vertex in verts1 to follow the new numbering. */
static
bool setDistinctSuffixTops(RoseGraph &g, NGHolder &h1, const NGHolder &h2,
                           const deque<RoseVertex> &verts1) {
    map<u32, u32> top_mapping;
    if (!setDistinctTops(h1, h2, top_mapping)) {
        return false;
    }
    if (top_mapping.empty()) {
        return true; // No remapping necessary.
    }

    // Each vertex triggers its suffix with a single top; renumber it.
    for (auto v : verts1) {
        DEBUG_PRINTF("vertex %zu\n", g[v].idx);
        const u32 old_top = g[v].suffix.top;
        assert(contains(top_mapping, old_top));
        g[v].suffix.top = top_mapping[old_top];
    }

    return true;
}

/* True if this graph already uses every available top, so no more can be
 * added by merging. */
static
bool hasMaxTops(const NGHolder &h) {
    const auto tops_used = getTops(h).size();
    return tops_used == NFA_MAX_TOP_MASKS;
}

/** \brief Estimate the number of accel states in the given graph when built as
 * an NFA.
 *
 * (The easiest way to estimate something like this is to actually build it:
 * the criteria for NFA acceleration are quite complicated and buried in
 * limex_compile.)
 */
static
u32 estimatedAccelStates(const RoseBuildImpl &tbi, const NGHolder &h) {
    // Trial-build based count, using the build's report manager and compile
    // context.
    return countAccelStates(h, &tbi.rm, tbi.cc);
}

/** \brief Greedily merge the NFA leftfixes in the given bouquet pairwise,
 * subject to top count, acceleration and size limits. */
static
void mergeNfaLeftfixes(RoseBuildImpl &tbi, RoseBouquet &roses) {
    RoseGraph &g = tbi.g;
    DEBUG_PRINTF("%zu nfa rose merge candidates\n", roses.size());

    // We track the number of accelerable states for each graph in a map and
    // only recompute them when the graph is modified.
    ue2::unordered_map<left_id, u32> accel_count;
    for (const auto &rose : roses) {
        assert(rose.graph()->kind == NFA_INFIX);
        accel_count[rose] = estimatedAccelStates(tbi, *rose.graph());
    }

    for (auto it = roses.begin(); it != roses.end(); ++it) {
        left_id r1 = *it;
        const deque<RoseVertex> &verts1 = roses.vertices(r1);

        // Engines merged into r1 this round; removed from the bouquet after
        // the inner loop so that `it' is not invalidated.
        deque<left_id> merged;
        for (auto jt = next(it); jt != roses.end(); ++jt) {
            left_id r2 = *jt;
            const deque<RoseVertex> &verts2 = roses.vertices(r2);

            DEBUG_PRINTF("consider merging rose %p (%zu verts) "
                         "with %p (%zu verts)\n",
                         r1.graph(), verts1.size(), r2.graph(), verts2.size());

            if (hasMaxTops(*r1.graph())) {
                DEBUG_PRINTF("h1 has hit max tops\n");
                break; // next h1
            }

            u32 accel1 = accel_count[r1];
            if (accel1 >= NFA_MAX_ACCEL_STATES) {
                DEBUG_PRINTF("h1 has hit max accel\n");
                break; // next h1
            }

            u32 accel2 = accel_count[r2];
            if (accel1 + accel2 > NFA_MAX_ACCEL_STATES) {
                DEBUG_PRINTF("not merging, might make unaccel (accel1=%u, "
                             "accel2=%u)\n",
                             accel1, accel2);
                continue; // next h2
            }

            if (!mergeableRoseVertices(tbi, verts1, verts2)) {
                DEBUG_PRINTF("not mergeable\n");
                continue; // next h2
            }

            // Attempt to merge h2 into h1.

            NGHolder victim;
            cloneHolder(victim, *r2.graph());

            // Store a copy of the in-edge properties in case we have to roll
            // back.
            map<RoseEdge, RoseEdgeProps> edge_props;
            for (auto v : verts2) {
                for (const auto &e : in_edges_range(v, g)) {
                    edge_props[e] = g[e];
                }
            }

            // Renumber the victim's tops (and verts2's trigger edges) so they
            // don't collide with r1's.
            if (!setDistinctRoseTops(g, victim, *r1.graph(), verts2)) {
                DEBUG_PRINTF("can't set distinct tops\n");
                continue; // next h2
            }

            assert(victim.kind == r1.graph()->kind);
            assert(!generates_callbacks(*r1.graph()));
            if (!mergeNfaPair(victim, *r1.graph(), nullptr, tbi.cc)) {
                DEBUG_PRINTF("merge failed\n");
                // Roll back in-edge properties.
                for (const auto &m : edge_props) {
                    g[m.first] = m.second;
                }
                continue; // next h2
            }

            // Update h2's roses to point to h1 now
            shared_ptr<NGHolder> winner = g[verts1.front()].left.graph;
            for (auto v : verts2) {
                g[v].left.graph = winner;
            }
            roses.insert(r1, verts2);

            merged.push_back(r2);

            if (num_vertices(*winner) >= small_merge_max_vertices(tbi.cc)) {
                DEBUG_PRINTF("h1 now has %zu vertices, proceeding to next\n",
                             num_vertices(*winner));
                break; // next h1
            }

            // Update h1's accel count estimate.
            accel_count[r1] = estimatedAccelStates(tbi, *winner);
        }

        DEBUG_PRINTF("%zu roses merged\n", merged.size());
        roses.erase_all(merged.begin(), merged.end());
    }
}

/** \brief Greedily merge the castle leftfixes in the given bouquet pairwise,
 * subject to occupancy limits and matching reach. */
static
void mergeCastleRoses(RoseBuildImpl &tbi, RoseBouquet &roses) {
    RoseGraph &g = tbi.g;
    DEBUG_PRINTF("%zu castle rose merge candidates\n", roses.size());

    // Castles merged into castle1 this round; removed from the bouquet after
    // the inner loop so that `it' is not invalidated.
    deque<left_id> merged;

    for (auto it = roses.begin(); it != roses.end(); ++it) {
        left_id r1 = *it;
        CastleProto &castle1 = *r1.castle();
        const deque<RoseVertex> &verts1 = roses.vertices(r1);

        merged.clear();

        for (auto jt = next(it); jt != roses.end(); ++jt) {
            left_id r2 = *jt;
            CastleProto &castle2 = *r2.castle();
            const deque<RoseVertex> &verts2 = roses.vertices(r2);

            if (castle1.repeats.size() == castle1.max_occupancy) {
                DEBUG_PRINTF("castle1 has hit max occupancy\n");
                break; // next castle1
            }

            if (castle1.reach() != castle2.reach()) {
                DEBUG_PRINTF("different reach\n");
                continue; // next castle2
            }

            if (!mergeableRoseVertices(tbi, verts1, verts2)) {
                DEBUG_PRINTF("not mergeable\n");
                continue; // next castle2
            }

            DEBUG_PRINTF("castle1=%p (size %zu), castle2=%p (size %zu)\n",
                         &castle1, castle1.repeats.size(), &castle2,
                         castle2.repeats.size());

            // Fold castle2's repeats into castle1; top_map records the
            // renumbering applied to castle2's tops.
            map<u32, u32> top_map;
            if (!mergeCastle(castle1, castle2, top_map)) {
                DEBUG_PRINTF("couldn't merge\n");
                continue; // next castle2
            }

            // Update castle2's roses to point to castle1 now.
            shared_ptr<CastleProto> winner = g[verts1.front()].left.castle;
            for (auto v : verts2) {
                g[v].left.castle = winner;
                for (const auto &e : in_edges_range(v, g)) {
                    g[e].rose_top = top_map.at(g[e].rose_top);
                }
            }

            roses.insert(r1, verts2);
            merged.push_back(r2);
        }

        DEBUG_PRINTF("%zu roses merged\n", merged.size());
        roses.erase_all(merged.begin(), merged.end());
    }
}

/**
 * This pass attempts to merge prefix/infix engines with a small number of
 * vertices together into larger engines. The engines must not be have a
 * reformed start dot star (due to a leading repeat) nor an infix LBR. Engines
 * that have compatible lag are greedily grouped such that they remain
 * accelerable and only have a small number of states. Note: if a role has an
 * infix with multiple trigger vertices, the role will be left unchanged by this
 * pass and will remain using an unmerged graph.
 */
void mergeSmallLeftfixes(RoseBuildImpl &tbi) {
    DEBUG_PRINTF("entry\n");

    if (!tbi.cc.grey.mergeRose || !tbi.cc.grey.roseMultiTopRoses) {
        return;
    }

    RoseGraph &g = tbi.g;

    // Collect the merge candidates.
    RoseBouquet candidates;

    for (auto v : vertices_range(g)) {
        if (!g[v].left) {
            continue;
        }

        // Handle single-parent infixes only; prefixes are skipped.
        if (tbi.isRootSuccessor(v)) {
            continue;
        }

        left_id left(g[v].left);

        // Only non-transient for the moment.
        if (contains(tbi.transient, left)) {
            continue;
        }

        // No DFAs or Haigs right now; castles are handled by a different
        // pass.
        if (left.dfa() || left.haig() || left.castle()) {
            continue;
        }

        assert(left.graph());
        NGHolder &h = *left.graph();

        /* Ensure that kind on the graph is correct */
        assert(h.kind == (tbi.isRootSuccessor(v) ? NFA_PREFIX : NFA_INFIX));

        if (hasReformedStartDotStar(h, tbi.cc.grey)) {
            /* We would lose optimisations of the leading repeat by merging. */
            continue;
        }

        // Don't merge cases that will become LBRs or haigs.
        if (isLargeLBR(h, tbi.cc.grey)) {
            continue;
        }

        // Small roses only.
        if (num_vertices(h) > small_rose_threshold(tbi.cc)) {
            continue;
        }

        candidates.insert(left, v);
    }

    // Split into bounded-size groups and merge within each group.
    deque<RoseBouquet> groups;
    chunkBouquets(candidates, groups, MERGE_GROUP_SIZE_MAX);
    candidates.clear();
    DEBUG_PRINTF("chunked nfa roses into %zu groups\n", groups.size());

    for (auto &group : groups) {
        mergeNfaLeftfixes(tbi, group);
    }
}

/* Attempt to merge castle infixes together. Castles are bucketed by reach,
 * since only castles with identical reach can be combined. */
void mergeCastleLeftfixes(RoseBuildImpl &tbi) {
    DEBUG_PRINTF("entry\n");

    if (!tbi.cc.grey.mergeRose || !tbi.cc.grey.roseMultiTopRoses ||
        !tbi.cc.grey.allowCastle) {
        return;
    }

    RoseGraph &g = tbi.g;

    map<CharReach, RoseBouquet> by_reach;

    for (auto v : vertices_range(g)) {
        if (!g[v].left) {
            continue;
        }

        // Handle single-parent infixes only; prefixes are skipped.
        if (tbi.isRootSuccessor(v)) {
            continue;
        }

        const left_id left(g[v].left);

        // Only non-transient castles are considered.
        if (contains(tbi.transient, left) || !left.castle()) {
            continue;
        }

        by_reach[left.castle()->reach()].insert(left, v);
    }

    for (auto &m : by_reach) {
        DEBUG_PRINTF("%zu castles for reach: %s\n", m.second.size(),
                     describeClass(m.first).c_str());
        RoseBouquet &roses = m.second;

        // Split into bounded-size groups and merge within each group.
        deque<RoseBouquet> groups;
        chunkBouquets(roses, groups, MERGE_GROUP_SIZE_MAX);
        roses.clear();

        for (auto &group : groups) {
            mergeCastleRoses(tbi, group);
        }
    }
}

/**
 * \brief Attempt pairwise merges of the suffix graphs in the given bouquet.
 *
 * Each suffix s1 is considered in turn as a merge target, and later suffixes
 * s2 are merged into it (via distinct top events) where possible. When
 * \p acyclic is false, a per-graph estimate of accelerable states is
 * maintained so that merges which would push the result over
 * NFA_MAX_ACCEL_STATES are skipped.
 */
static
void mergeSuffixes(RoseBuildImpl &tbi, SuffixBouquet &suffixes,
                   const bool acyclic) {
    RoseGraph &g = tbi.g;

    DEBUG_PRINTF("group has %zu suffixes\n", suffixes.size());

    // If this isn't an acyclic case, we track the number of accelerable states
    // for each graph in a map and only recompute them when the graph is
    // modified.
    ue2::unordered_map<suffix_id, u32> accel_count;
    if (!acyclic) {
        for (const auto &suffix : suffixes) {
            assert(suffix.graph() && suffix.graph()->kind == NFA_SUFFIX);
            accel_count[suffix] = estimatedAccelStates(tbi, *suffix.graph());
        }
    }

    for (auto it = suffixes.begin(); it != suffixes.end(); ++it) {
        suffix_id s1 = *it;
        const deque<RoseVertex> &verts1 = suffixes.vertices(s1);
        assert(s1.graph() && s1.graph()->kind == NFA_SUFFIX);
        deque<suffix_id> merged; // suffixes successfully merged into s1
        for (auto jt = next(it); jt != suffixes.end(); ++jt) {
            suffix_id s2 = *jt;
            const deque<RoseVertex> &verts2 = suffixes.vertices(s2);
            assert(s2.graph() && s2.graph()->kind == NFA_SUFFIX);

            if (hasMaxTops(*s1.graph())) {
                DEBUG_PRINTF("h1 has hit max tops\n");
                break; // next h1
            }

            if (!acyclic) {
                u32 accel1 = accel_count[s1];
                if (accel1 >= NFA_MAX_ACCEL_STATES) {
                    DEBUG_PRINTF("h1 has hit max accel\n");
                    break; // next h1
                }

                u32 accel2 = accel_count[s2];
                if (accel1 + accel2 > NFA_MAX_ACCEL_STATES) {
                    DEBUG_PRINTF("not merging, might make unaccel (accel1=%u, "
                                 "accel2=%u)\n",
                                 accel1, accel2);
                    continue; // next h2
                }
            }

            // Attempt to merge h2 into h1.

            // Work on a clone of s2's graph so that a failed merge leaves the
            // original untouched.
            NGHolder victim;
            cloneHolder(victim, *s2.graph());

            // Store a copy of the suffix tops in case we have to roll back.
            map<RoseVertex, u32> old_tops;
            for (auto v : verts2) {
                old_tops[v] = g[v].suffix.top;
            }

            // Allocate tops for s2's vertices that do not collide with those
            // already in use by s1.
            if (!setDistinctSuffixTops(g, victim, *s1.graph(), verts2)) {
                DEBUG_PRINTF("can't set distinct tops\n");
                continue; // next h2
            }

            if (!mergeNfaPair(victim, *s1.graph(), &tbi.rm, tbi.cc)) {
                DEBUG_PRINTF("merge failed\n");
                // Roll back in-edge properties.
                for (const auto &m : old_tops) {
                    g[m.first].suffix.top = m.second;
                }
                continue; // next h2
            }

            // Update h2's roses to point to h1 now
            shared_ptr<NGHolder> winner = g[verts1.front()].suffix.graph;
            for (auto v : verts2) {
                g[v].suffix.graph = winner;
            }
            suffixes.insert(s1, verts2);
            merged.push_back(s2);

            // If s1 has grown large, stop merging more suffixes into it.
            if (num_vertices(*s1.graph()) >= small_merge_max_vertices(tbi.cc)) {
                DEBUG_PRINTF("h1 now has %zu vertices, proceeding to next\n",
                             num_vertices(*s1.graph()));
                break; // next h1
            }

            if (!acyclic) {
                // Update h1's accel count estimate.
                accel_count[s1] = estimatedAccelStates(tbi, *s1.graph());
            }
        }

        DEBUG_PRINTF("%zu suffixes merged\n", merged.size());
        suffixes.erase_all(merged.begin(), merged.end());
    }
}

/**
 * This merge pass combines suffixes from unrelated roles into a single
 * suffix with multiple top events in order to distinguish the triggers
 * from differing roles. mergeAcyclicSuffixes only considers acyclic suffixes
 * while mergeSmallSuffixes only considers small suffixes. The merges will
 * group roles with suffixes in the graph into clusters of at most
 * \ref MERGE_GROUP_SIZE_MAX. Each cluster is processed by iterating over the
 * suffixes and attempting to pairwise merge it with another member. Merges
 * will fail if the result is not implementable, requires too many distinct top
 * events, or if it loses the ability to be accelerated. The merge will modify
 * the existing suffix graph of one member (g1); the other member updates
 * its graph to refer to g1 instead of its previous graph (g2) and uses the new
 * tops created. Other roles may have been sharing g1 - these are unaffected by
 * the change as the existing top events are left untouched. Other roles using
 * g2 are also unaffected as g2 will continue to exist while it has any
 * roles triggering it.
 *
 * Note: suffixes destined for the LBR are not considered for these merges as
 * the LBR can only handle a single repeat and this type of repeat is ideally
 * handled outside of an NFA or DFA.
 */
void mergeAcyclicSuffixes(RoseBuildImpl &tbi) {
    DEBUG_PRINTF("entry\n");

    if (!tbi.cc.grey.mergeSuffixes) {
        return;
    }

    RoseGraph &g = tbi.g;
    SuffixBouquet candidates;

    // Gather every acyclic, non-LBR suffix graph that is eligible for
    // merging; ETable (EOD) roles are left alone.
    for (auto v : vertices_range(g)) {
        shared_ptr<NGHolder> h = g[v].suffix.graph;
        if (!h || tbi.isInETable(v)) {
            continue;
        }

        assert(!g[v].suffix.haig);

        // This pass considers acyclic suffixes only.
        if (!isAcyclic(*h)) {
            continue;
        }

        if (isLargeLBR(*h, tbi.cc.grey)) {
            DEBUG_PRINTF("not considering LBR suffix for merge\n");
            continue;
        }

        candidates.insert(g[v].suffix, v);
    }

    deque<SuffixBouquet> groups;
    chunkBouquets(candidates, groups, MERGE_GROUP_SIZE_MAX);
    DEBUG_PRINTF("chunked %zu suffixes into %zu groups\n", candidates.size(),
                 groups.size());
    candidates.clear();

    for (auto &group : groups) {
        mergeSuffixes(tbi, group, true);
    }
}

/**
 * This merge pass combines suffixes from unrelated roles into a single
 * suffix with multiple top events in order to distinguish the triggers
 * from differing roles. mergeAcyclicSuffixes only considers acyclic suffixes
 * while mergeSmallSuffixes only considers small suffixes. The merges will
 * group roles with suffixes in the graph into clusters of at most
 * \ref MERGE_GROUP_SIZE_MAX. Each cluster is processed by iterating over the
 * suffixes and attempting to pairwise merge it with another member. Merges
 * will fail if the result is not implementable, requires too many distinct top
 * events, or if it loses the ability to be accelerated. The merge will modify
 * the existing suffix graph of one member (g1); the other member updates
 * its graph to refer to g1 instead of its previous graph (g2) and uses the new
 * tops created. Other roles may have been sharing g1 - these are unaffected by
 * the change as the existing top events are left untouched. Other roles using
 * g2 are also unaffected as g2 will continue to exist while it has any
 * roles triggering it.
 *
 * Note: suffixes destined for the LBR are not considered for these merges as
 * the LBR can only handle a single repeat and this type of repeat is ideally
 * handled outside of an NFA or DFA.
 */
void mergeSmallSuffixes(RoseBuildImpl &tbi) {
    DEBUG_PRINTF("entry\n");

    if (!tbi.cc.grey.mergeSuffixes) {
        return;
    }

    RoseGraph &g = tbi.g;
    SuffixBouquet candidates;

    // Only cyclic suffixes with at most this many vertices take part.
    static const size_t MAX_SMALL_SUFFIX_VERTICES = 32;

    for (auto v : vertices_range(g)) {
        shared_ptr<NGHolder> h = g[v].suffix.graph;
        if (!h || tbi.isInETable(v)) {
            continue;
        }
        assert(!g[v].suffix.haig);

        // Acyclic suffixes are handled by mergeAcyclicSuffixes.
        if (isAcyclic(*h)) {
            continue;
        }

        // Small-ish suffixes only.
        if (num_vertices(*h) > MAX_SMALL_SUFFIX_VERTICES) {
            continue;
        }

        if (isLargeLBR(*h, tbi.cc.grey)) {
            DEBUG_PRINTF("not considering LBR suffix for merge\n");
            continue;
        }

        candidates.insert(g[v].suffix, v);
    }

    deque<SuffixBouquet> groups;
    chunkBouquets(candidates, groups, MERGE_GROUP_SIZE_MAX);
    DEBUG_PRINTF("chunked %zu suffixes into %zu groups\n", candidates.size(),
                 groups.size());
    candidates.clear();

    for (auto &group : groups) {
        mergeSuffixes(tbi, group, false);
    }
}

// Sweep out any outfix that has been emptied ("killed") by a merge, using
// the erase-remove idiom.
static
void removeDeadOutfixes(vector<OutfixInfo> &outfixes) {
    auto new_end = remove_if(begin(outfixes), end(outfixes),
                             [](const OutfixInfo &o) { return o.is_dead(); });
    outfixes.erase(new_end, end(outfixes));
}

/**
 * \brief Fold the accounting info of a merged-away outfix (victim) into the
 * outfix that now implements its pattern (winner).
 */
static
void mergeOutfixInfo(OutfixInfo &winner, const OutfixInfo &victim) {
    assert(!winner.is_dead());

    // Width/offset bounds must now cover both patterns.
    winner.maxBAWidth = max(winner.maxBAWidth, victim.maxBAWidth);
    winner.minWidth = min(winner.minWidth, victim.minWidth);
    winner.maxWidth = max(winner.maxWidth, victim.maxWidth);
    winner.maxOffset = max(winner.maxOffset, victim.maxOffset);
    mergeReverseAccelerationInfo(winner.rev_info, victim.rev_info);

    // This outfix can be ignored in small block mode if both were. The dedupe
    // layer at runtime will protect us from extra matches if only one was in
    // the small block matcher.
    winner.in_sbmatcher &= victim.in_sbmatcher;

    // We should never have merged outfixes that differ in these properties.
    assert(winner.chained == victim.chained);
}

// Carve the NFA list into batches of at most MERGE_GROUP_SIZE_MAX engines
// and merge each batch as a cluster, accumulating the victim->winner map.
static
map<NGHolder *, NGHolder *> chunkedNfaMerge(RoseBuildImpl &build,
                                            const vector<NGHolder *> &nfas) {
    map<NGHolder *, NGHolder *> merged;

    vector<NGHolder *> batch;
    const size_t count = nfas.size();
    for (size_t i = 0; i < count; i++) {
        assert(nfas[i]->kind == NFA_OUTFIX);
        batch.push_back(nfas[i]);
        if (batch.size() == MERGE_GROUP_SIZE_MAX || i + 1 == count) {
            mergeNfaCluster(batch, &build.rm, merged, build.cc);
            batch.clear();
        }
    }

    return merged;
}

// Merge NFA outfixes together, then fold each victim's outfix info into its
// winner and remove the dead entries from the outfix array.
static
void mergeOutfixNfas(RoseBuildImpl &tbi, vector<NGHolder *> &nfas) {
    DEBUG_PRINTF("merging %zu nfas\n", nfas.size());
    if (nfas.size() < 2) {
        return;
    }

    vector<OutfixInfo> &outfixes = tbi.outfixes;

    // Map each holder back to its index in the outfix array.
    map<NGHolder *, size_t> holder_to_index;
    for (size_t i = 0; i < outfixes.size(); i++) {
        if (outfixes[i].holder) {
            holder_to_index[outfixes[i].holder.get()] = i;
        }
    }

    const map<NGHolder *, NGHolder *> merged = chunkedNfaMerge(tbi, nfas);
    if (merged.empty()) {
        return;
    }

    DEBUG_PRINTF("%zu nfas merged\n", merged.size());

    // Update the outfix info for merged holders.
    for (const auto &m : merged) {
        NGHolder *victim_h = m.first;
        NGHolder *winner_h = m.second;
        OutfixInfo &victim = outfixes.at(holder_to_index[victim_h]);
        OutfixInfo &winner = outfixes.at(holder_to_index[winner_h]);
        mergeOutfixInfo(winner, victim);
        victim.clear();
    }

    removeDeadOutfixes(outfixes);
}

namespace {
/**
 * \brief Merge functor for McClellan (raw_dfa) outfixes, for use with
 * pairwiseDfaMerge/chunkedDfaMerge.
 */
struct MergeMcClellan {
    MergeMcClellan(const ReportManager &rm_in, const Grey &grey_in)
        : rm(rm_in), grey(grey_in) {}

    // Attempt the merge of two DFAs; returns nullptr on failure.
    unique_ptr<raw_dfa> operator()(const raw_dfa *d1, const raw_dfa *d2) const {
        assert(d1 && d2);
        return mergeTwoDfas(d1, d2, DFA_MERGE_MAX_STATES, &rm, grey);
    }

    // Install the merged DFA into the winning outfix.
    static void transfer(OutfixInfo &outfix, unique_ptr<raw_dfa> d) {
        outfix.rdfa = move(d);
    }

private:
    const ReportManager &rm;
    const Grey &grey;
};

/**
 * \brief Merge functor for Haig (raw_som_dfa) outfixes, for use with
 * pairwiseDfaMerge/chunkedDfaMerge.
 */
struct MergeHaig {
    explicit MergeHaig(u32 limit_in) : limit(limit_in) {}

    // Attempt the merge of two Haig DFAs; returns nullptr on failure.
    unique_ptr<raw_som_dfa> operator()(const raw_som_dfa *d1,
                                       const raw_som_dfa *d2) const {
        assert(d1 && d2);
        return attemptToMergeHaig({d1, d2}, limit);
    }

    // Install the merged Haig into the winning outfix.
    static void transfer(OutfixInfo &outfix, unique_ptr<raw_som_dfa> d) {
        outfix.haig = move(d);
    }

private:
    const u32 limit; //!< state limit for merged result.
};
}

/**
 * Generic pairwise merge algorithm that can be used for either McClellan
 * (RawDfa=raw_dfa) or Haig (RawDfa=raw_som_dfa). Delegates the actual merge
 * operation to a merge functor, which allows the caller to set some policy
 * (state limits, etc).
 *
 * This is currently astonishingly simple and just considers every pair of
 * DFAs, slow and steady. We may wish to actually apply a merge ordering
 * strategy in the future.
 */
template<class RawDfa, class MergeFunctor>
static
void pairwiseDfaMerge(vector<RawDfa *> &dfas,
                      ue2::unordered_map<RawDfa *, size_t> &dfa_mapping,
                      vector<OutfixInfo> &outfixes,
                      MergeFunctor merge_func) {
    DEBUG_PRINTF("merging group of size %zu\n", dfas.size());

    for (auto it = dfas.begin(), ite = dfas.end(); it != ite; ++it) {
        if (!*it) {
            continue; // Merged away earlier in the loop.
        }
        for (auto jt = next(it); jt != ite; ++jt) {
            if (!*jt) {
                continue; // Merged away earlier in the loop.
            }

            DEBUG_PRINTF("try merge %p and %p\n", *it, *jt);
            unique_ptr<RawDfa> rdfa = merge_func(*it, *jt);
            if (!rdfa) {
                continue; // Merge failed.
            }

            DEBUG_PRINTF("merge succeeded, built %p\n", rdfa.get());
            OutfixInfo &winner = outfixes.at(dfa_mapping[*it]);
            OutfixInfo &victim = outfixes.at(dfa_mapping[*jt]);
            assert(!winner.is_dead() && !victim.is_dead());

            // The merged DFA takes over the winner's slot in the mapping and
            // ownership moves into the winner's outfix via transfer().
            RawDfa *dfa_ptr = rdfa.get();
            dfa_mapping[dfa_ptr] = dfa_mapping[*it];
            dfa_mapping.erase(*it);
            merge_func.transfer(winner, move(rdfa));

            mergeOutfixInfo(winner, victim);

            victim.clear();
            *jt = nullptr; // to be deleted.
            *it = dfa_ptr; // Continue merging into the new DFA.
        }
    }
}

// Split the DFA set into chunks of at most DFA_CHUNK_SIZE_MAX, run the
// pairwise merge within each chunk, and return the surviving DFAs in-place
// via the \p dfas parameter.
template<class RawDfa, class MergeFunctor>
static
void chunkedDfaMerge(vector<RawDfa *> &dfas,
                     ue2::unordered_map<RawDfa *, size_t> &dfa_mapping,
                     vector<OutfixInfo> &outfixes,
                     MergeFunctor merge_func) {
    DEBUG_PRINTF("begin merge of %zu dfas\n", dfas.size());

    vector<RawDfa *> survivors;
    vector<RawDfa *> chunk;
    const size_t count = dfas.size();
    for (size_t i = 0; i < count; i++) {
        chunk.push_back(dfas[i]);
        if (chunk.size() >= DFA_CHUNK_SIZE_MAX || i + 1 == count) {
            pairwiseDfaMerge(chunk, dfa_mapping, outfixes, merge_func);
            survivors.insert(end(survivors), begin(chunk), end(chunk));
            chunk.clear();
        }
    }

    // Remove null (merged) DFAs and update vector for subsequent use.
    survivors.erase(remove(begin(survivors), end(survivors), nullptr),
                    end(survivors));
    dfas.swap(survivors);
    DEBUG_PRINTF("after merge there are %zu dfas\n", dfas.size());
}

// Merge McClellan DFA outfixes with each other, then cull outfixes killed by
// the merge.
static
void mergeOutfixDfas(RoseBuildImpl &tbi, vector<raw_dfa *> &dfas) {
    DEBUG_PRINTF("merging %zu nfas\n", dfas.size());
    if (dfas.size() < 2) {
        return;
    }

    vector<OutfixInfo> &outfixes = tbi.outfixes;

    /* key is index into outfix array as iterators, etc may be invalidated by
     * element addition. */
    ue2::unordered_map<raw_dfa *, size_t> rdfa_to_index;
    for (size_t i = 0; i < outfixes.size(); i++) {
        raw_dfa *rdfa = outfixes[i].rdfa.get();
        if (rdfa) {
            rdfa_to_index[rdfa] = i;
        }
    }

    chunkedDfaMerge(dfas, rdfa_to_index, outfixes,
                    MergeMcClellan(tbi.rm, tbi.cc.grey));
    removeDeadOutfixes(outfixes);
}

/**
 * \brief Attempt to convert NFA outfixes into McClellan DFAs and merge them
 * together with any existing DFA outfixes.
 *
 * Only runs when Grey::roseMcClellanOutfix is set and there is either more
 * than one NFA outfix or at least one NFA and one DFA outfix present.
 * Assumes ordinary DFA-DFA merging has already been performed.
 */
static
void mergeOutfixCombo(RoseBuildImpl &tbi, const ReportManager &rm,
                      const Grey &grey) {
    if (!grey.roseMcClellanOutfix) {
        return;
    }

    DEBUG_PRINTF("merge combo\n");

    // Count the engines that could take part in a combo merge.
    bool seen_dfa = false;
    u32 nfa_count = 0;
    for (const auto &outfix : tbi.outfixes) {
        if (outfix.holder) {
            DEBUG_PRINTF("nfa\n");
            nfa_count++;
        } else if (outfix.rdfa) {
            DEBUG_PRINTF("dfa\n");
            seen_dfa = true;
        }
    }

    DEBUG_PRINTF("nfa %u dfas present %d\n", nfa_count,
                  (int)seen_dfa);
    if (!nfa_count || (nfa_count == 1 && !seen_dfa)) {
        DEBUG_PRINTF("no combo merges possible\n");
        return;
    }

    /* key is index into outfix array as iterators, etc may be invalidated by
     * element addition. */
    size_t new_dfas = 0;
    ue2::unordered_map<raw_dfa *, size_t> dfa_mapping;
    vector<raw_dfa *> dfas;

    for (auto it = tbi.outfixes.begin(); it != tbi.outfixes.end(); ++it) {
        assert(!it->is_dead());
        // Outfixes already built into an engine are skipped.
        if (it->nfa) {
            assert(!it->rdfa && !it->holder && !it->haig);
            continue;
        }
        assert(!it->chained);
        // Existing DFA outfixes join the merge set directly.
        if (it->rdfa) {
            dfas.push_back(it->rdfa.get());
            dfa_mapping[it->rdfa.get()] = it - tbi.outfixes.begin();
            continue;
        }

        if (!it->holder) {
            continue;
        }

        NGHolder *h = it->holder.get();
        assert(h->kind == NFA_OUTFIX);
        auto rdfa = buildMcClellan(*h, &rm, grey);
        if (rdfa) {
            // Transform this outfix into a DFA and add it to the merge set.
            dfa_mapping[rdfa.get()] = it - tbi.outfixes.begin();
            dfas.push_back(rdfa.get());
            it->clear();
            it->rdfa = move(rdfa);
            new_dfas++;
        }
    }

    DEBUG_PRINTF("constructed %zu new dfas\n", new_dfas);

    if (!new_dfas) {
        /* assumes normal dfas have already been fully merged */
        return;
    }

    chunkedDfaMerge(dfas, dfa_mapping, tbi.outfixes,
                    MergeMcClellan(tbi.rm, tbi.cc.grey));
    removeDeadOutfixes(tbi.outfixes);
}

// Merge Haig (SOM DFA) outfixes with each other, bounded by the given state
// limit, then cull outfixes killed by the merge.
static
void mergeOutfixHaigs(RoseBuildImpl &tbi, vector<raw_som_dfa *> &dfas,
                      u32 limit) {
    if (dfas.size() < 2) {
        return;
    }

    vector<OutfixInfo> &outfixes = tbi.outfixes;

    // Map each Haig back to its index in the outfix array; indices remain
    // valid even if the vector reallocates.
    ue2::unordered_map<raw_som_dfa *, size_t> haig_to_index;
    for (size_t i = 0; i < outfixes.size(); i++) {
        raw_som_dfa *haig = outfixes[i].haig.get();
        if (haig) {
            haig_to_index[haig] = i;
        }
    }

    chunkedDfaMerge(dfas, haig_to_index, outfixes, MergeHaig(limit));
    removeDeadOutfixes(outfixes);
}

/**
 * This pass attempts to merge outfix engines together. At this point in time,
 * the engine type (NFA, DFA, Haig) has already been decided for each outfix
 * and outfixes can only merged with others of their same type. NFAs are merged
 * in a priority order based on common prefix length. The other types are
 * merged blindly. Engines are merged to the extent that they can still be
 * implemented efficiently.
 */
void mergeOutfixes(RoseBuildImpl &tbi) {
    if (!tbi.cc.grey.mergeOutfixes) {
        return;
    }

    // Sort the outfixes into buckets by implementation type; merges only
    // happen between engines of the same type.
    vector<NGHolder *> nfas;
    vector<raw_dfa *> dfas;
    vector<raw_som_dfa *> som_dfas;

    for (const auto &outfix : tbi.outfixes) {
        if (outfix.nfa) {
            // Already built into an engine; nothing left to merge.
            assert(!outfix.rdfa && !outfix.holder && !outfix.haig);
            continue;
        }
        assert(!outfix.chained);
        if (outfix.rdfa) {
            dfas.push_back(outfix.rdfa.get());
            continue;
        }
        if (outfix.holder) {
            nfas.push_back(outfix.holder.get());
            continue;
        }
        if (outfix.haig) {
            som_dfas.push_back(outfix.haig.get());
        }
    }

    DEBUG_PRINTF("merging %zu dfas, %zu nfas\n",
                 dfas.size(), nfas.size());

    mergeOutfixNfas(tbi, nfas);
    mergeOutfixDfas(tbi, dfas);
    // Haigs are merged in two passes with increasing state limits.
    mergeOutfixHaigs(tbi, som_dfas, 255);
    mergeOutfixHaigs(tbi, som_dfas, 8192);
    mergeOutfixCombo(tbi, tbi.rm, tbi.cc.grey);
}

// Compute how far a puffette trigger is allowed to squash: the full
// min_width if no literal on the vertex can end in a byte within the
// repeat's reach, otherwise zero.
static
u32 allowedSquashDistance(const CharReach &cr, u32 min_width,
                          const RoseBuildImpl &tbi,
                          RoseVertex tv) {
    DEBUG_PRINTF("hello |cr|=%zu\n", cr.count());

    const RoseGraph &graph = tbi.g;

    // Union of the reach of the final byte of every literal on this vertex.
    CharReach last_byte_cr;

    /* TODO: inspect further back in the pattern */
    for (u32 id : graph[tv].literals) {
        const rose_literal_id &literal = tbi.literals.right.at(id);
        if (literal.delay) {
            return 0; /* TODO: better */
        }
        if (literal.table != ROSE_FLOATING &&
            literal.table != ROSE_EOD_ANCHORED) {
            return 0;
        }
        assert(!literal.s.empty());
        last_byte_cr |= *literal.s.rbegin();
    }

    DEBUG_PRINTF("|accept_cr|=%zu\n", last_byte_cr.count());

    if ((last_byte_cr & cr).any()) {
        DEBUG_PRINTF("no squash\n");
        return 0; /* the accept byte doesn't always kill the puffette. TODO:
                   * maybe if we look further back we could find something that
                   * would kill the puffette... */
    }

    DEBUG_PRINTF("allowed to squash %u\n", min_width);
    return min_width;
}

/**
 * \brief Replace suffixes that are pure repeats ("puffettes") with chained
 * MPV puffs.
 *
 * For each vertex whose suffix graph is puffable, a raw_puff is added as a
 * chain tail; the vertex's suffix is then removed and replaced with an
 * internal report that fires the chain trigger, with an appropriate squash
 * distance.
 */
void mergePuffixes(RoseBuildImpl &tbi) {
    DEBUG_PRINTF("entry\n");

    if (!tbi.cc.grey.mergeSuffixes) {
        return;
    }

    RoseGraph &g = tbi.g;

    for (auto v : vertices_range(g)) {
        shared_ptr<NGHolder> h = g[v].suffix.graph;
        if (!h) {
            continue;
        }
        assert(!g[v].suffix.haig);
        assert(!g[v].eod_accept);

        assert(onlyOneTop(*h)); /* we should not have merged yet */
        bool fixed_depth = g[v].min_offset == g[v].max_offset;

        if (!isPuffable(*h, fixed_depth, tbi.rm, tbi.cc.grey)) {
            continue;
        }

        // A puffable suffix must be expressible as a pure repeat.
        PureRepeat repeat;
        if (!isPureRepeat(*h, repeat)) {
            assert(0);
            continue;
        }

        if (repeat.bounds.min == depth(0)) {
            assert(0); // No vacuous puffs allowed.
            continue;
        }

        assert(repeat.bounds.min.is_finite() &&
               repeat.bounds.max.is_reachable());
        assert(repeat.bounds.max == repeat.bounds.min ||
               repeat.bounds.max.is_infinite());

        const bool unbounded = repeat.bounds.max.is_infinite();
        const set<ReportID> reports = all_reports(*h);
        assert(reports.size() == 1);
        ReportID report = *reports.begin();

        DEBUG_PRINTF("got puffette candidate %u:%s\n", report,
                     repeat.bounds.str().c_str());

        raw_puff rp(repeat.bounds.min, unbounded, report, repeat.reach);

        // Add the puff as a chain tail; we get back the MPV queue and the
        // top event used to trigger it.
        u32 queue;
        u32 event;
        tbi.addChainTail(rp, &queue, &event);
        u32 squashDistance =
            allowedSquashDistance(repeat.reach, repeat.bounds.min, tbi, v);

        Report ir = makeRoseTrigger(event, squashDistance);
        ReportID id = tbi.rm.getInternalId(ir);

        DEBUG_PRINTF("puffette event q%u t%u\n", queue, event);
        // Replace the suffix with a report that fires the chain trigger.
        g[v].suffix.reset();
        g[v].reports.insert(id);
    }
}

// Repoint every given vertex's suffix at the merged castle prototype and the
// top event allocated for its repeat within that prototype.
static
void updateCastleSuffix(RoseGraph &g, const shared_ptr<CastleProto> &m,
                        u32 top, const vector<RoseVertex> &verts) {
    DEBUG_PRINTF("merged in as top %u, updating %zu vertices\n", top,
                  verts.size());

    for (auto it = verts.begin(); it != verts.end(); ++it) {
        RoseVertex v = *it;
        assert(g[v].suffix.castle);
        g[v].suffix.castle = m;
        g[v].suffix.top = top;
    }
}

/**
 * \brief Merge a set of single-repeat castle suffixes (all sharing the same
 * reach) into multi-top castle prototypes.
 *
 * Castles are folded into an accumulator prototype 'm' until it reaches
 * CastleProto::max_occupancy, at which point the current castle becomes the
 * new accumulator. A cache of already-merged repeats ensures that identical
 * repeats share one (prototype, top) pair rather than being added twice.
 */
static
void mergeCastleSuffixes(RoseBuildImpl &tbi,
            vector<shared_ptr<CastleProto> > &castles,
            map<shared_ptr<CastleProto>, vector<RoseVertex> > &castle_map) {
    if (castles.size() <= 1) {
        return;
    }

    RoseGraph &g = tbi.g;
    const size_t max_size = CastleProto::max_occupancy;

    shared_ptr<CastleProto> m = castles.front();
    assert(m->repeats.size() == 1); // Not yet merged.

    // Cache repeats we've already merged, mapped to (prototype, top). That
    // way, we can ensure that we don't construct more than one completely
    // identical repeat.
    typedef map<PureRepeat, pair<shared_ptr<CastleProto>, u32> > RepeatCache;
    RepeatCache cache;
    {
        // Initial entry in cache.
        const u32 top = m->repeats.begin()->first;
        const PureRepeat &pr = m->repeats.begin()->second;
        cache[pr] = make_pair(m, top);
    }

    for (size_t i = 1; i < castles.size(); i++) {
        shared_ptr<CastleProto> c = castles[i];
        assert(c->repeats.size() == 1); // Not yet merged.
        const PureRepeat &pr = c->repeats.begin()->second;
        RepeatCache::const_iterator it = cache.find(pr);
        if (it != cache.end()) {
            // An identical repeat was merged earlier: reuse its (proto, top).
            DEBUG_PRINTF("reusing cached merge, top=%u, proto=%p\n",
                         it->second.second, it->second.first.get());
            updateCastleSuffix(g, it->second.first, it->second.second,
                               castle_map[c]);
            continue;
        }

        if (m->repeats.size() == max_size) {
            // No room left to merge into 'm'. This one becomes the new 'm'.
            DEBUG_PRINTF("next mergee\n");
            m = c; // c's vertices already refer to it; no update needed.
            u32 top = m->repeats.begin()->first;
            cache[pr] = make_pair(m, top);
        } else {
            u32 top = m->add(pr);
            updateCastleSuffix(g, m, top, castle_map[c]);
            DEBUG_PRINTF("added to %p, top %u\n", m.get(), top);
            cache[pr] = make_pair(m, top);
        }
    }
}

void mergeCastleSuffixes(RoseBuildImpl &tbi) {
    DEBUG_PRINTF("entry\n");

    if (!(tbi.cc.grey.allowCastle && tbi.cc.grey.mergeSuffixes)) {
        return;
    }

    RoseGraph &g = tbi.g;

    // Map each castle proto to the vertices using it as a suffix, and bucket
    // the distinct protos by reach (only same-reach castles can merge).
    map<shared_ptr<CastleProto>, vector<RoseVertex>> verts_by_castle;
    map<CharReach, vector<shared_ptr<CastleProto>>> protos_by_reach;

    for (auto v : vertices_range(g)) {
        shared_ptr<CastleProto> c = g[v].suffix.castle;
        if (!c) {
            continue;
        }

        if (c->repeats.size() != 1) {
            // This code assumes it's the only place merging is being done.
            assert(0);
            continue;
        }

        if (!contains(verts_by_castle, c)) {
            protos_by_reach[c->reach()].push_back(c);
        }
        verts_by_castle[c].push_back(v);
    }

    for (auto &m : protos_by_reach) {
        DEBUG_PRINTF("reach %s, %zu elements\n", describeClass(m.first).c_str(),
                     m.second.size());
        mergeCastleSuffixes(tbi, m.second, verts_by_castle);
    }
}

} // namespace ue2
